diff --git "a/sft/Full_xmoe/checkpoint-9984/trainer_state.json" "b/sft/Full_xmoe/checkpoint-9984/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/Full_xmoe/checkpoint-9984/trainer_state.json" @@ -0,0 +1,149793 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002705546370058, + "eval_steps": 500, + "global_step": 9984, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.03013315, + "auxiliary_loss_mlp": 0.02744923, + "balance_loss_clip": 2.50106812, + "balance_loss_mlp": 2.27685976, + "epoch": 6.012325266796934e-05, + "flos": 24458666507280.0, + "grad_norm": 54.43001028097931, + "language_loss": 2.85806966, + "learning_rate": 0.0, + "loss": 1.93911517, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 17.218258142471313 + }, + { + "auxiliary_loss_clip": 0.01998256, + "auxiliary_loss_mlp": 0.01776085, + "balance_loss_clip": 1.65772355, + "balance_loss_mlp": 1.46900177, + "epoch": 0.00012024650533593868, + "flos": 20227782627360.0, + "grad_norm": 37.3266094357333, + "language_loss": 1.82876635, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.8665098, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.680952548980713 + }, + { + "auxiliary_loss_clip": 0.02013651, + "auxiliary_loss_mlp": 0.01861754, + "balance_loss_clip": 1.67236495, + "balance_loss_mlp": 1.54818571, + "epoch": 0.000180369758003908, + "flos": 22312145426400.0, + "grad_norm": 33.11945019554233, + "language_loss": 1.57677627, + "learning_rate": 7.073439208833112e-07, + "loss": 1.61553025, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.6395492553710938 + }, + { + "auxiliary_loss_clip": 0.0202262, + "auxiliary_loss_mlp": 0.01776837, + "balance_loss_clip": 1.68153977, + "balance_loss_mlp": 1.46517646, + "epoch": 0.00024049301067187735, + "flos": 22416600672000.0, + "grad_norm": 51.42054239553306, + "language_loss": 1.67806602, + "learning_rate": 8.925686513863519e-07, + "loss": 1.71606064, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.6672027111053467 + }, + { + "auxiliary_loss_clip": 0.02015029, + "auxiliary_loss_mlp": 0.01830344, + "balance_loss_clip": 1.67292809, + "balance_loss_mlp": 1.52097154, + "epoch": 0.0003006162633398467, + "flos": 21399091722720.0, + "grad_norm": 56.03699938995309, + "language_loss": 1.91210973, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.95056343, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.8996057510375977 + }, + { + "auxiliary_loss_clip": 0.0201537, + "auxiliary_loss_mlp": 0.01766716, + "balance_loss_clip": 1.6752665, + "balance_loss_mlp": 1.45734358, + "epoch": 0.000360739516007816, + "flos": 21654502502400.0, + "grad_norm": 33.323703131407456, + "language_loss": 1.61018276, + "learning_rate": 1.153628246576487e-06, + "loss": 1.6480037, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.9529802799224854 + }, + { + "auxiliary_loss_clip": 0.02003282, + "auxiliary_loss_mlp": 0.01811772, + "balance_loss_clip": 1.66173887, + "balance_loss_mlp": 1.49457991, + "epoch": 0.0004208627686757854, + "flos": 27162067700160.0, + "grad_norm": 24.892583195240473, + "language_loss": 1.53420734, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.57235789, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 2.9300003051757812 + }, + { + "auxiliary_loss_clip": 0.02010534, + "auxiliary_loss_mlp": 0.01786893, + "balance_loss_clip": 1.67002845, + "balance_loss_mlp": 1.47714019, + "epoch": 0.0004809860213437547, + "flos": 31321652611680.0, + "grad_norm": 31.530435385814524, + "language_loss": 1.43878901, + "learning_rate": 1.338852977079528e-06, + "loss": 1.47676325, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 3.013366222381592 + }, + { + "auxiliary_loss_clip": 0.01987283, + "auxiliary_loss_mlp": 0.01814924, + "balance_loss_clip": 1.64452052, + "balance_loss_mlp": 1.50555253, + "epoch": 0.000541109274011724, + "flos": 32163135148800.0, + "grad_norm": 59.26963784586382, + "language_loss": 1.49698734, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.53500962, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 2.9573099613189697 + }, + { + "auxiliary_loss_clip": 0.02008619, + "auxiliary_loss_mlp": 0.01827872, + "balance_loss_clip": 1.66791677, + "balance_loss_mlp": 1.51010823, + "epoch": 0.0006012325266796934, + "flos": 18918830782080.0, + "grad_norm": 27.996403581866602, + "language_loss": 1.44770384, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.48606873, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.942462682723999 + }, + { + "auxiliary_loss_clip": 0.02011901, + "auxiliary_loss_mlp": 0.01836187, + "balance_loss_clip": 1.67106807, + "balance_loss_mlp": 1.5220468, + "epoch": 0.0006613557793476627, + "flos": 20776836136320.0, + "grad_norm": 18.6540741882004, + "language_loss": 1.45404971, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.49253058, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 2.941220760345459 + }, + { + "auxiliary_loss_clip": 0.02009063, + "auxiliary_loss_mlp": 0.0180274, + "balance_loss_clip": 1.66872859, + "balance_loss_mlp": 1.47849107, + "epoch": 0.000721479032015632, + "flos": 16583570654400.0, + "grad_norm": 17.195973004382665, + "language_loss": 1.45370173, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49181986, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.8963370323181152 + }, + { + "auxiliary_loss_clip": 0.02004208, + "auxiliary_loss_mlp": 0.01807014, + "balance_loss_clip": 1.66430497, + "balance_loss_mlp": 1.48924959, + "epoch": 0.0007816022846836014, + "flos": 23807743568640.0, + "grad_norm": 13.958882003230386, + "language_loss": 1.28860188, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.32671416, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 3.0859341621398926 + }, + { + "auxiliary_loss_clip": 0.02006669, + "auxiliary_loss_mlp": 0.01801743, + "balance_loss_clip": 1.66423118, + "balance_loss_mlp": 1.48645806, + "epoch": 0.0008417255373515708, + "flos": 19174241561760.0, + "grad_norm": 5.949417343087796, + "language_loss": 1.21050024, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.24858439, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 4.45519757270813 + }, + { + "auxiliary_loss_clip": 0.0200799, + "auxiliary_loss_mlp": 0.01839546, + "balance_loss_clip": 1.66656184, + "balance_loss_mlp": 1.52235413, + "epoch": 0.00090184879001954, + "flos": 26397959338080.0, + "grad_norm": 6.763082640600665, + "language_loss": 1.12800694, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.16648221, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 4.3720667362213135 + }, + { + "auxiliary_loss_clip": 0.02002563, + "auxiliary_loss_mlp": 0.01840189, + "balance_loss_clip": 1.66098499, + "balance_loss_mlp": 1.52509499, + "epoch": 0.0009619720426875094, + "flos": 24681730903200.0, + "grad_norm": 4.596167343404168, + "language_loss": 1.11293101, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15135849, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 4.470144271850586 + }, + { + "auxiliary_loss_clip": 0.01988196, + "auxiliary_loss_mlp": 0.01835549, + "balance_loss_clip": 1.64600503, + "balance_loss_mlp": 1.51683164, + "epoch": 0.0010220952953554788, + "flos": 18626515538400.0, + "grad_norm": 4.755168778451519, + "language_loss": 1.12818229, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.16641974, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 2.8725385665893555 + }, + { + "auxiliary_loss_clip": 0.01991761, + "auxiliary_loss_mlp": 0.0183003, + "balance_loss_clip": 1.65113461, + "balance_loss_mlp": 1.51169348, + "epoch": 0.001082218548023448, + "flos": 26145051816960.0, + "grad_norm": 5.7149966099798775, + "language_loss": 1.0816288, + "learning_rate": 1.860972167459798e-06, + "loss": 1.11984658, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 3.093780279159546 + }, + { + "auxiliary_loss_clip": 0.01999149, + "auxiliary_loss_mlp": 0.01788995, + "balance_loss_clip": 1.65759969, + "balance_loss_mlp": 1.47638071, + "epoch": 0.0011423418006914173, + "flos": 19611823115520.0, + "grad_norm": 4.674524491582701, + "language_loss": 1.02489424, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06277573, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 2.807722330093384 + }, + { + "auxiliary_loss_clip": 0.0199839, + "auxiliary_loss_mlp": 0.0181655, + "balance_loss_clip": 1.65663743, + "balance_loss_mlp": 1.48962998, + "epoch": 0.0012024650533593868, + "flos": 17897339376000.0, + "grad_norm": 5.978554434891839, + "language_loss": 1.16737354, + "learning_rate": 1.928808765521199e-06, + "loss": 1.20552301, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 2.9782938957214355 + }, + { + "auxiliary_loss_clip": 0.02001045, + "auxiliary_loss_mlp": 0.01722894, + "balance_loss_clip": 1.65933275, + "balance_loss_mlp": 1.39883554, + "epoch": 0.001262588306027356, + "flos": 21254204694240.0, + "grad_norm": 6.293016780687086, + "language_loss": 1.06250584, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.09974527, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.887416362762451 + }, + { + "auxiliary_loss_clip": 0.01959563, + "auxiliary_loss_mlp": 0.01709186, + "balance_loss_clip": 1.61698735, + "balance_loss_mlp": 1.38341033, + "epoch": 0.0013227115586953253, + "flos": 26106478513920.0, + "grad_norm": 3.4418278102692166, + "language_loss": 1.06179941, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.0984869, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.949183940887451 + }, + { + "auxiliary_loss_clip": 0.01963019, + "auxiliary_loss_mlp": 0.01761066, + "balance_loss_clip": 1.61872137, + "balance_loss_mlp": 1.43529069, + "epoch": 0.0013828348113632948, + "flos": 23953654657440.0, + "grad_norm": 2.907012658347345, + "language_loss": 0.91965729, + "learning_rate": 2.018794797290208e-06, + "loss": 0.95689809, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 2.9378178119659424 + }, + { + "auxiliary_loss_clip": 0.01972874, + "auxiliary_loss_mlp": 0.01734671, + "balance_loss_clip": 1.63094616, + "balance_loss_mlp": 1.42224741, + "epoch": 0.001442958064031264, + "flos": 15961618493280.0, + "grad_norm": 2.807321365166143, + "language_loss": 1.08248687, + "learning_rate": 2.046196897962839e-06, + "loss": 1.11956239, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 2.8333916664123535 + }, + { + "auxiliary_loss_clip": 0.01970008, + "auxiliary_loss_mlp": 0.01655952, + "balance_loss_clip": 1.62891817, + "balance_loss_mlp": 1.3194958, + "epoch": 0.0015030813166992333, + "flos": 18110004755040.0, + "grad_norm": 3.848546861676896, + "language_loss": 1.01354134, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.04980087, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 2.8710219860076904 + }, + { + "auxiliary_loss_clip": 0.01981886, + "auxiliary_loss_mlp": 0.0169316, + "balance_loss_clip": 1.64089668, + "balance_loss_mlp": 1.36223507, + "epoch": 0.0015632045693672028, + "flos": 22236629731200.0, + "grad_norm": 2.736171592070547, + "language_loss": 1.06495738, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10170794, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 2.98580002784729 + }, + { + "auxiliary_loss_clip": 0.0198121, + "auxiliary_loss_mlp": 0.01697054, + "balance_loss_clip": 1.63961411, + "balance_loss_mlp": 1.3733772, + "epoch": 0.001623327822035172, + "flos": 23994465721920.0, + "grad_norm": 2.657387221725565, + "language_loss": 0.9549489, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99173158, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 2.922116756439209 + }, + { + "auxiliary_loss_clip": 0.01973961, + "auxiliary_loss_mlp": 0.01700374, + "balance_loss_clip": 1.63313389, + "balance_loss_mlp": 1.35342753, + "epoch": 0.0016834510747031415, + "flos": 19679411825280.0, + "grad_norm": 2.7403829376701054, + "language_loss": 1.06394219, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1006856, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.8738694190979004 + }, + { + "auxiliary_loss_clip": 0.01972442, + "auxiliary_loss_mlp": 0.01658873, + "balance_loss_clip": 1.63159692, + "balance_loss_mlp": 1.32050943, + "epoch": 0.0017435743273711108, + "flos": 20925971118720.0, + "grad_norm": 2.258100834017194, + "language_loss": 1.02509022, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06140327, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 2.859247922897339 + }, + { + "auxiliary_loss_clip": 0.01948144, + "auxiliary_loss_mlp": 0.01641888, + "balance_loss_clip": 1.60460067, + "balance_loss_mlp": 1.30543137, + "epoch": 0.00180369758003908, + "flos": 19530580268160.0, + "grad_norm": 3.4908204754653878, + "language_loss": 1.1916399, + "learning_rate": 2.189868360711334e-06, + "loss": 1.22754014, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 2.8668315410614014 + }, + { + "auxiliary_loss_clip": 0.01968221, + "auxiliary_loss_mlp": 0.01708222, + "balance_loss_clip": 1.62726676, + "balance_loss_mlp": 1.36566234, + "epoch": 0.0018638208327070496, + "flos": 27455596644960.0, + "grad_norm": 3.0221190549573818, + "language_loss": 1.02506471, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06182921, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 2.8582358360290527 + }, + { + "auxiliary_loss_clip": 0.01961249, + "auxiliary_loss_mlp": 0.01705921, + "balance_loss_clip": 1.62049103, + "balance_loss_mlp": 1.36030936, + "epoch": 0.0019239440853750188, + "flos": 13591160668800.0, + "grad_norm": 2.598454436565777, + "language_loss": 0.95436502, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99103665, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.8425302505493164 + }, + { + "auxiliary_loss_clip": 0.01971117, + "auxiliary_loss_mlp": 0.01661755, + "balance_loss_clip": 1.63002288, + "balance_loss_mlp": 1.3121376, + "epoch": 0.001984067338042988, + "flos": 11255407475040.0, + "grad_norm": 4.042337359690822, + "language_loss": 0.95049727, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.98682594, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.8458826541900635 + }, + { + "auxiliary_loss_clip": 0.01932769, + "auxiliary_loss_mlp": 0.01645201, + "balance_loss_clip": 1.59076059, + "balance_loss_mlp": 1.3009243, + "epoch": 0.0020441905907109576, + "flos": 22388647253760.0, + "grad_norm": 2.5176654694900518, + "language_loss": 0.91502762, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95080733, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.8946564197540283 + }, + { + "auxiliary_loss_clip": 0.01906146, + "auxiliary_loss_mlp": 0.01659347, + "balance_loss_clip": 1.56377697, + "balance_loss_mlp": 1.29981208, + "epoch": 0.0021043138433789266, + "flos": 49782041352000.0, + "grad_norm": 2.581583657292783, + "language_loss": 0.76661193, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80226684, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 3.1000895500183105 + }, + { + "auxiliary_loss_clip": 0.01911224, + "auxiliary_loss_mlp": 0.01633502, + "balance_loss_clip": 1.56970406, + "balance_loss_mlp": 1.2819773, + "epoch": 0.002164437096046896, + "flos": 20560036587840.0, + "grad_norm": 2.1738322030218193, + "language_loss": 0.88690859, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92235589, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.875767230987549 + }, + { + "auxiliary_loss_clip": 0.01911529, + "auxiliary_loss_mlp": 0.01600606, + "balance_loss_clip": 1.56868482, + "balance_loss_mlp": 1.23840022, + "epoch": 0.0022245603487148656, + "flos": 26544742774560.0, + "grad_norm": 2.430643865991714, + "language_loss": 0.92902827, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96414965, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 3.0447964668273926 + }, + { + "auxiliary_loss_clip": 0.01912826, + "auxiliary_loss_mlp": 0.01632547, + "balance_loss_clip": 1.57052207, + "balance_loss_mlp": 1.27167702, + "epoch": 0.0022846836013828346, + "flos": 20340202786560.0, + "grad_norm": 2.1587057554038114, + "language_loss": 1.03955936, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07501316, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.953655958175659 + }, + { + "auxiliary_loss_clip": 0.01912859, + "auxiliary_loss_mlp": 0.01663493, + "balance_loss_clip": 1.5714643, + "balance_loss_mlp": 1.30491161, + "epoch": 0.002344806854050804, + "flos": 26249848416000.0, + "grad_norm": 2.332506673453385, + "language_loss": 0.85292381, + "learning_rate": 2.358792165262154e-06, + "loss": 0.88868731, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 2.8804116249084473 + }, + { + "auxiliary_loss_clip": 0.0190132, + "auxiliary_loss_mlp": 0.01596627, + "balance_loss_clip": 1.55790687, + "balance_loss_mlp": 1.23594689, + "epoch": 0.0024049301067187736, + "flos": 11803133498400.0, + "grad_norm": 2.9535936247413184, + "language_loss": 0.90090185, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93588126, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 2.871279716491699 + }, + { + "auxiliary_loss_clip": 0.01893513, + "auxiliary_loss_mlp": 0.01541667, + "balance_loss_clip": 1.54943919, + "balance_loss_mlp": 1.17755401, + "epoch": 0.0024650533593867426, + "flos": 20633617946880.0, + "grad_norm": 3.6131280569086863, + "language_loss": 0.93241274, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96676451, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.897951602935791 + }, + { + "auxiliary_loss_clip": 0.01897653, + "auxiliary_loss_mlp": 0.01550639, + "balance_loss_clip": 1.55366135, + "balance_loss_mlp": 1.19053197, + "epoch": 0.002525176612054712, + "flos": 18408123007200.0, + "grad_norm": 2.789510412716303, + "language_loss": 0.9734748, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.0079577, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.9289116859436035 + }, + { + "auxiliary_loss_clip": 0.01898208, + "auxiliary_loss_mlp": 0.01583273, + "balance_loss_clip": 1.55608213, + "balance_loss_mlp": 1.22240233, + "epoch": 0.0025852998647226816, + "flos": 28186555430880.0, + "grad_norm": 3.207648959675087, + "language_loss": 0.97464377, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00945854, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.920456886291504 + }, + { + "auxiliary_loss_clip": 0.01889132, + "auxiliary_loss_mlp": 0.01613714, + "balance_loss_clip": 1.54527545, + "balance_loss_mlp": 1.25494146, + "epoch": 0.0026454231173906506, + "flos": 14284873637280.0, + "grad_norm": 2.6799713922995916, + "language_loss": 0.93396878, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.96899724, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.8830463886260986 + }, + { + "auxiliary_loss_clip": 0.01897213, + "auxiliary_loss_mlp": 0.01572855, + "balance_loss_clip": 1.55484736, + "balance_loss_mlp": 1.2087425, + "epoch": 0.00270554637005862, + "flos": 22421417548320.0, + "grad_norm": 2.0208214403203537, + "language_loss": 0.98647738, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02117801, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.8623547554016113 + }, + { + "auxiliary_loss_clip": 0.01885463, + "auxiliary_loss_mlp": 0.01569226, + "balance_loss_clip": 1.54362154, + "balance_loss_mlp": 1.2026335, + "epoch": 0.0027656696227265896, + "flos": 23987942078400.0, + "grad_norm": 1.987503926250225, + "language_loss": 1.02663624, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06118321, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.9786767959594727 + }, + { + "auxiliary_loss_clip": 0.01881646, + "auxiliary_loss_mlp": 0.0159389, + "balance_loss_clip": 1.53837943, + "balance_loss_mlp": 1.22748804, + "epoch": 0.0028257928753945586, + "flos": 37672520898240.0, + "grad_norm": 2.1687377873091362, + "language_loss": 0.87853527, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91329062, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 3.055675506591797 + }, + { + "auxiliary_loss_clip": 0.01883369, + "auxiliary_loss_mlp": 0.01569578, + "balance_loss_clip": 1.53972697, + "balance_loss_mlp": 1.1980269, + "epoch": 0.002885916128062528, + "flos": 22456691101440.0, + "grad_norm": 1.889624291370793, + "language_loss": 0.88113058, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91566002, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 3.054354190826416 + }, + { + "auxiliary_loss_clip": 0.018592, + "auxiliary_loss_mlp": 0.01599308, + "balance_loss_clip": 1.51661682, + "balance_loss_mlp": 1.23710299, + "epoch": 0.0029460393807304976, + "flos": 27015056694720.0, + "grad_norm": 3.1931583771563403, + "language_loss": 0.89423656, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.92882168, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.9633865356445312 + }, + { + "auxiliary_loss_clip": 0.01855927, + "auxiliary_loss_mlp": 0.01588763, + "balance_loss_clip": 1.51159239, + "balance_loss_mlp": 1.22217083, + "epoch": 0.0030061626333984666, + "flos": 15853673856960.0, + "grad_norm": 2.3519052743127893, + "language_loss": 0.90905344, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94350028, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.821136951446533 + }, + { + "auxiliary_loss_clip": 0.01850252, + "auxiliary_loss_mlp": 0.01569771, + "balance_loss_clip": 1.50660038, + "balance_loss_mlp": 1.20279717, + "epoch": 0.003066285886066436, + "flos": 31829477846400.0, + "grad_norm": 7.434045332981381, + "language_loss": 0.8686502, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90285045, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.9139091968536377 + }, + { + "auxiliary_loss_clip": 0.01853301, + "auxiliary_loss_mlp": 0.01563572, + "balance_loss_clip": 1.51129723, + "balance_loss_mlp": 1.19450021, + "epoch": 0.0031264091387344056, + "flos": 41430708084960.0, + "grad_norm": 2.40993977832427, + "language_loss": 0.95123577, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98540443, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 4.619322299957275 + }, + { + "auxiliary_loss_clip": 0.01869179, + "auxiliary_loss_mlp": 0.01580796, + "balance_loss_clip": 1.52616358, + "balance_loss_mlp": 1.20600247, + "epoch": 0.0031865323914023747, + "flos": 23443212379680.0, + "grad_norm": 1.974029162830753, + "language_loss": 0.92142701, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.9559269, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 5.865837812423706 + }, + { + "auxiliary_loss_clip": 0.01869114, + "auxiliary_loss_mlp": 0.01562698, + "balance_loss_clip": 1.52747762, + "balance_loss_mlp": 1.19419861, + "epoch": 0.003246655644070344, + "flos": 14430974366880.0, + "grad_norm": 7.64902874582464, + "language_loss": 0.82820958, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.86252773, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 4.42754054069519 + }, + { + "auxiliary_loss_clip": 0.01856733, + "auxiliary_loss_mlp": 0.01556881, + "balance_loss_clip": 1.51419806, + "balance_loss_mlp": 1.18990672, + "epoch": 0.0033067788967383136, + "flos": 35921170622880.0, + "grad_norm": 2.451396233204177, + "language_loss": 0.81401944, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84815562, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 3.0656604766845703 + }, + { + "auxiliary_loss_clip": 0.01857621, + "auxiliary_loss_mlp": 0.01591373, + "balance_loss_clip": 1.51499629, + "balance_loss_mlp": 1.23736882, + "epoch": 0.003366902149406283, + "flos": 22960533879360.0, + "grad_norm": 2.881372859264907, + "language_loss": 0.87065953, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90514946, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 2.842888355255127 + }, + { + "auxiliary_loss_clip": 0.01860473, + "auxiliary_loss_mlp": 0.01579366, + "balance_loss_clip": 1.51852071, + "balance_loss_mlp": 1.21658802, + "epoch": 0.003427025402074252, + "flos": 26585515910880.0, + "grad_norm": 2.0137401079780317, + "language_loss": 0.9266119, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.96101034, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 3.0554943084716797 + }, + { + "auxiliary_loss_clip": 0.01868687, + "auxiliary_loss_mlp": 0.0161126, + "balance_loss_clip": 1.52685857, + "balance_loss_mlp": 1.23188806, + "epoch": 0.0034871486547422216, + "flos": 23953465016640.0, + "grad_norm": 2.1427396825293603, + "language_loss": 0.99439132, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02919078, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 2.8449318408966064 + }, + { + "auxiliary_loss_clip": 0.01851288, + "auxiliary_loss_mlp": 0.01593955, + "balance_loss_clip": 1.50846672, + "balance_loss_mlp": 1.22202182, + "epoch": 0.003547271907410191, + "flos": 21217110589440.0, + "grad_norm": 2.481350334789067, + "language_loss": 0.88058889, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91504139, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.8881540298461914 + }, + { + "auxiliary_loss_clip": 0.01847782, + "auxiliary_loss_mlp": 0.01597189, + "balance_loss_clip": 1.50531065, + "balance_loss_mlp": 1.24070537, + "epoch": 0.00360739516007816, + "flos": 16506575760960.0, + "grad_norm": 2.0962932951541466, + "language_loss": 0.93453676, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96898645, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 2.8500030040740967 + }, + { + "auxiliary_loss_clip": 0.01845444, + "auxiliary_loss_mlp": 0.016057, + "balance_loss_clip": 1.50212836, + "balance_loss_mlp": 1.24654627, + "epoch": 0.0036675184127461296, + "flos": 19466898158880.0, + "grad_norm": 2.770078057177811, + "language_loss": 0.89878422, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93329573, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.930814504623413 + }, + { + "auxiliary_loss_clip": 0.01839593, + "auxiliary_loss_mlp": 0.01584009, + "balance_loss_clip": 1.49670339, + "balance_loss_mlp": 1.22599947, + "epoch": 0.003727641665414099, + "flos": 20959310335680.0, + "grad_norm": 2.264710573485336, + "language_loss": 0.883255, + "learning_rate": 2.657264485425803e-06, + "loss": 0.91749108, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.8642966747283936 + }, + { + "auxiliary_loss_clip": 0.01847175, + "auxiliary_loss_mlp": 0.01605425, + "balance_loss_clip": 1.50412941, + "balance_loss_mlp": 1.23597193, + "epoch": 0.003787764918082068, + "flos": 18408047150880.0, + "grad_norm": 1.8635402243785502, + "language_loss": 0.96118146, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99570745, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.848515033721924 + }, + { + "auxiliary_loss_clip": 0.01858624, + "auxiliary_loss_mlp": 0.0158938, + "balance_loss_clip": 1.51653755, + "balance_loss_mlp": 1.22755575, + "epoch": 0.0038478881707500376, + "flos": 12460928135040.0, + "grad_norm": 2.459472733639988, + "language_loss": 0.98909569, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02357578, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.901719093322754 + }, + { + "auxiliary_loss_clip": 0.01850378, + "auxiliary_loss_mlp": 0.01592222, + "balance_loss_clip": 1.50796843, + "balance_loss_mlp": 1.22334075, + "epoch": 0.003908011423418007, + "flos": 13555052696160.0, + "grad_norm": 3.369543743494967, + "language_loss": 0.85360694, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88803291, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.7996394634246826 + }, + { + "auxiliary_loss_clip": 0.01834022, + "auxiliary_loss_mlp": 0.01568454, + "balance_loss_clip": 1.49037623, + "balance_loss_mlp": 1.20205259, + "epoch": 0.003968134676085976, + "flos": 18335489852160.0, + "grad_norm": 2.259463602928144, + "language_loss": 0.85105443, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88507921, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.8637850284576416 + }, + { + "auxiliary_loss_clip": 0.01826269, + "auxiliary_loss_mlp": 0.01580601, + "balance_loss_clip": 1.48432386, + "balance_loss_mlp": 1.22888553, + "epoch": 0.004028257928753946, + "flos": 20487327576480.0, + "grad_norm": 3.5124185010423457, + "language_loss": 0.96274817, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99681687, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.8805811405181885 + }, + { + "auxiliary_loss_clip": 0.01817628, + "auxiliary_loss_mlp": 0.01571913, + "balance_loss_clip": 1.47501159, + "balance_loss_mlp": 1.19597507, + "epoch": 0.004088381181421915, + "flos": 18845856273600.0, + "grad_norm": 2.440410890967043, + "language_loss": 0.94493997, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97883534, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.910098075866699 + }, + { + "auxiliary_loss_clip": 0.01817124, + "auxiliary_loss_mlp": 0.01565595, + "balance_loss_clip": 1.47537279, + "balance_loss_mlp": 1.20987415, + "epoch": 0.004148504434089885, + "flos": 19429879910400.0, + "grad_norm": 2.356684054221942, + "language_loss": 0.95742697, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.99125415, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.8659002780914307 + }, + { + "auxiliary_loss_clip": 0.01831335, + "auxiliary_loss_mlp": 0.01587374, + "balance_loss_clip": 1.48896766, + "balance_loss_mlp": 1.21811104, + "epoch": 0.004208627686757853, + "flos": 20812982037120.0, + "grad_norm": 2.3301153158945755, + "language_loss": 0.97892725, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.01311445, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.872121810913086 + }, + { + "auxiliary_loss_clip": 0.01818368, + "auxiliary_loss_mlp": 0.01611395, + "balance_loss_clip": 1.47643828, + "balance_loss_mlp": 1.2551024, + "epoch": 0.004268750939425823, + "flos": 19100622274560.0, + "grad_norm": 2.8107834987578233, + "language_loss": 0.93990737, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.97420496, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.8362479209899902 + }, + { + "auxiliary_loss_clip": 0.02276325, + "auxiliary_loss_mlp": 0.01484749, + "balance_loss_clip": 1.93589914, + "balance_loss_mlp": 1.14829254, + "epoch": 0.004328874192093792, + "flos": 52445155409280.0, + "grad_norm": 2.4258741146424376, + "language_loss": 0.65789562, + "learning_rate": 2.75354081884615e-06, + "loss": 0.69550639, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.4405410289764404 + }, + { + "auxiliary_loss_clip": 0.02263431, + "auxiliary_loss_mlp": 0.0146077, + "balance_loss_clip": 1.92258775, + "balance_loss_mlp": 1.11668396, + "epoch": 0.004388997444761762, + "flos": 66480838485120.0, + "grad_norm": 2.265717096520154, + "language_loss": 0.6382972, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.67553926, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.369170665740967 + }, + { + "auxiliary_loss_clip": 0.01799871, + "auxiliary_loss_mlp": 0.01547171, + "balance_loss_clip": 1.45742059, + "balance_loss_mlp": 1.1794343, + "epoch": 0.004449120697429731, + "flos": 18954976682880.0, + "grad_norm": 3.3899974175787855, + "language_loss": 0.86011809, + "learning_rate": 2.771181708202938e-06, + "loss": 0.89358854, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.8723437786102295 + }, + { + "auxiliary_loss_clip": 0.01804462, + "auxiliary_loss_mlp": 0.01560316, + "balance_loss_clip": 1.46069098, + "balance_loss_mlp": 1.19238901, + "epoch": 0.004509243950097701, + "flos": 21107610898560.0, + "grad_norm": 4.267400295052841, + "language_loss": 0.97127211, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00491989, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.8509719371795654 + }, + { + "auxiliary_loss_clip": 0.01797468, + "auxiliary_loss_mlp": 0.01562788, + "balance_loss_clip": 1.45409799, + "balance_loss_mlp": 1.18951964, + "epoch": 0.004569367202765669, + "flos": 20700523949760.0, + "grad_norm": 3.2186305219393923, + "language_loss": 0.87617981, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90978229, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.860159158706665 + }, + { + "auxiliary_loss_clip": 0.01787627, + "auxiliary_loss_mlp": 0.01578604, + "balance_loss_clip": 1.44279492, + "balance_loss_mlp": 1.20056701, + "epoch": 0.004629490455433639, + "flos": 28661041448640.0, + "grad_norm": 2.0173871371399374, + "language_loss": 0.91727144, + "learning_rate": 2.796768605577095e-06, + "loss": 0.95093381, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 2.9134883880615234 + }, + { + "auxiliary_loss_clip": 0.01801768, + "auxiliary_loss_mlp": 0.01563356, + "balance_loss_clip": 1.45857191, + "balance_loss_mlp": 1.18684578, + "epoch": 0.004689613708101608, + "flos": 11073995264160.0, + "grad_norm": 2.2530343389990466, + "language_loss": 0.92413992, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95779121, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.921921730041504 + }, + { + "auxiliary_loss_clip": 0.0179671, + "auxiliary_loss_mlp": 0.01573841, + "balance_loss_clip": 1.45256925, + "balance_loss_mlp": 1.20038223, + "epoch": 0.004749736960769578, + "flos": 21801285938880.0, + "grad_norm": 3.0404126203480213, + "language_loss": 0.82506895, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85877442, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 2.843963861465454 + }, + { + "auxiliary_loss_clip": 0.01778866, + "auxiliary_loss_mlp": 0.0156144, + "balance_loss_clip": 1.43437755, + "balance_loss_mlp": 1.18550169, + "epoch": 0.004809860213437547, + "flos": 19794676596480.0, + "grad_norm": 2.721209156689351, + "language_loss": 0.9127875, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.9461906, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.8316915035247803 + }, + { + "auxiliary_loss_clip": 0.01789311, + "auxiliary_loss_mlp": 0.01567436, + "balance_loss_clip": 1.44487882, + "balance_loss_mlp": 1.19874597, + "epoch": 0.004869983466105517, + "flos": 26576526936960.0, + "grad_norm": 2.1006803284646685, + "language_loss": 0.94963634, + "learning_rate": 2.829375683533245e-06, + "loss": 0.98320383, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.926065683364868 + }, + { + "auxiliary_loss_clip": 0.01790686, + "auxiliary_loss_mlp": 0.01587859, + "balance_loss_clip": 1.44637764, + "balance_loss_mlp": 1.21573532, + "epoch": 0.004930106718773485, + "flos": 12825042114240.0, + "grad_norm": 3.372311878731944, + "language_loss": 0.95853615, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99232155, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.849884033203125 + }, + { + "auxiliary_loss_clip": 0.01775305, + "auxiliary_loss_mlp": 0.01569296, + "balance_loss_clip": 1.43009758, + "balance_loss_mlp": 1.18687248, + "epoch": 0.004990229971441455, + "flos": 25777258806240.0, + "grad_norm": 2.040962594538945, + "language_loss": 0.86512232, + "learning_rate": 2.84508017388607e-06, + "loss": 0.89856827, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.9015629291534424 + }, + { + "auxiliary_loss_clip": 0.01773568, + "auxiliary_loss_mlp": 0.01575449, + "balance_loss_clip": 1.42945409, + "balance_loss_mlp": 1.20981085, + "epoch": 0.005050353224109424, + "flos": 17459188899840.0, + "grad_norm": 6.884434730180042, + "language_loss": 0.91788846, + "learning_rate": 2.852791070641559e-06, + "loss": 0.9513787, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.858863353729248 + }, + { + "auxiliary_loss_clip": 0.02145835, + "auxiliary_loss_mlp": 0.01505264, + "balance_loss_clip": 1.80006111, + "balance_loss_mlp": 1.11616516, + "epoch": 0.005110476476777394, + "flos": 69811874726400.0, + "grad_norm": 1.4292852295527783, + "language_loss": 0.62583315, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.66234422, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.4012370109558105 + }, + { + "auxiliary_loss_clip": 0.01761551, + "auxiliary_loss_mlp": 0.01554071, + "balance_loss_clip": 1.41594124, + "balance_loss_mlp": 1.18766928, + "epoch": 0.005170599729445363, + "flos": 24792406367040.0, + "grad_norm": 4.360957607233462, + "language_loss": 0.90690124, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.94005752, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.9059064388275146 + }, + { + "auxiliary_loss_clip": 0.01772337, + "auxiliary_loss_mlp": 0.01591194, + "balance_loss_clip": 1.42666399, + "balance_loss_mlp": 1.22975159, + "epoch": 0.005230722982113333, + "flos": 23260283042400.0, + "grad_norm": 2.9127894845503364, + "language_loss": 0.82041323, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.85404855, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.829425811767578 + }, + { + "auxiliary_loss_clip": 0.01779297, + "auxiliary_loss_mlp": 0.01595439, + "balance_loss_clip": 1.43511474, + "balance_loss_mlp": 1.22789288, + "epoch": 0.005290846234781301, + "flos": 16729823096640.0, + "grad_norm": 2.711567482771107, + "language_loss": 0.95830393, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.99205124, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.876547336578369 + }, + { + "auxiliary_loss_clip": 0.01776209, + "auxiliary_loss_mlp": 0.01609648, + "balance_loss_clip": 1.43206012, + "balance_loss_mlp": 1.25774252, + "epoch": 0.005350969487449271, + "flos": 20888156378880.0, + "grad_norm": 2.237227006610966, + "language_loss": 0.85828131, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.89213991, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 4.560169458389282 + }, + { + "auxiliary_loss_clip": 0.01763111, + "auxiliary_loss_mlp": 0.01585507, + "balance_loss_clip": 1.41664803, + "balance_loss_mlp": 1.21986818, + "epoch": 0.00541109274011724, + "flos": 26212185388800.0, + "grad_norm": 2.0716511499903603, + "language_loss": 0.91549492, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94898105, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.8546600341796875 + }, + { + "auxiliary_loss_clip": 0.01760682, + "auxiliary_loss_mlp": 0.01563219, + "balance_loss_clip": 1.41632771, + "balance_loss_mlp": 1.2078805, + "epoch": 0.00547121599278521, + "flos": 21180699191520.0, + "grad_norm": 2.5568651121523085, + "language_loss": 0.85875559, + "learning_rate": 2.90432674275074e-06, + "loss": 0.8919946, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 5.906277894973755 + }, + { + "auxiliary_loss_clip": 0.01767794, + "auxiliary_loss_mlp": 0.01581438, + "balance_loss_clip": 1.423334, + "balance_loss_mlp": 1.22304702, + "epoch": 0.005531339245453179, + "flos": 19720829740320.0, + "grad_norm": 3.0855026165219823, + "language_loss": 0.87155938, + "learning_rate": 2.91136344867656e-06, + "loss": 0.90505177, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 4.351999282836914 + }, + { + "auxiliary_loss_clip": 0.0175898, + "auxiliary_loss_mlp": 0.01588665, + "balance_loss_clip": 1.41227376, + "balance_loss_mlp": 1.2338984, + "epoch": 0.005591462498121149, + "flos": 17637832355040.0, + "grad_norm": 3.990934816865033, + "language_loss": 0.91901422, + "learning_rate": 2.918324080615938e-06, + "loss": 0.95249063, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 2.8792145252227783 + }, + { + "auxiliary_loss_clip": 0.01768583, + "auxiliary_loss_mlp": 0.01588721, + "balance_loss_clip": 1.42276084, + "balance_loss_mlp": 1.23319101, + "epoch": 0.005651585750789117, + "flos": 20013296696640.0, + "grad_norm": 2.2521390246461794, + "language_loss": 0.87535822, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90893126, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 2.8183891773223877 + }, + { + "auxiliary_loss_clip": 0.02067425, + "auxiliary_loss_mlp": 0.01800476, + "balance_loss_clip": 1.71946859, + "balance_loss_mlp": 1.50750732, + "epoch": 0.005711709003457087, + "flos": 59819070156480.0, + "grad_norm": 1.5749318650899682, + "language_loss": 0.68141347, + "learning_rate": 2.932023580065507e-06, + "loss": 0.72009248, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 3.2542543411254883 + }, + { + "auxiliary_loss_clip": 0.01755286, + "auxiliary_loss_mlp": 0.01560823, + "balance_loss_clip": 1.40750802, + "balance_loss_mlp": 1.19747305, + "epoch": 0.005771832256125056, + "flos": 15561396541440.0, + "grad_norm": 2.748372821327911, + "language_loss": 0.90071386, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.93387496, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 2.8846993446350098 + }, + { + "auxiliary_loss_clip": 0.01759968, + "auxiliary_loss_mlp": 0.01614413, + "balance_loss_clip": 1.41237664, + "balance_loss_mlp": 1.26117229, + "epoch": 0.005831955508793026, + "flos": 22530575885760.0, + "grad_norm": 5.560707407490209, + "language_loss": 0.89738524, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.93112904, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 2.8960373401641846 + }, + { + "auxiliary_loss_clip": 0.01746955, + "auxiliary_loss_mlp": 0.01551358, + "balance_loss_clip": 1.40050793, + "balance_loss_mlp": 1.19430244, + "epoch": 0.005892078761460995, + "flos": 22051766057760.0, + "grad_norm": 2.1784913450117185, + "language_loss": 0.76186949, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79485261, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 3.050239324569702 + }, + { + "auxiliary_loss_clip": 0.02036279, + "auxiliary_loss_mlp": 0.01467537, + "balance_loss_clip": 1.68687391, + "balance_loss_mlp": 1.10437775, + "epoch": 0.005952202014128965, + "flos": 68546085491520.0, + "grad_norm": 1.0502745876268251, + "language_loss": 0.65528238, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.69032049, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 3.4221532344818115 + }, + { + "auxiliary_loss_clip": 0.01751651, + "auxiliary_loss_mlp": 0.01561636, + "balance_loss_clip": 1.40499616, + "balance_loss_mlp": 1.19676065, + "epoch": 0.006012325266796933, + "flos": 22962809568960.0, + "grad_norm": 2.3822640371533175, + "language_loss": 0.90828037, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.94141316, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 2.889101505279541 + }, + { + "auxiliary_loss_clip": 0.0175147, + "auxiliary_loss_mlp": 0.01550629, + "balance_loss_clip": 1.40358055, + "balance_loss_mlp": 1.18174815, + "epoch": 0.006072448519464903, + "flos": 17349878849760.0, + "grad_norm": 2.543858099214464, + "language_loss": 0.91024148, + "learning_rate": 2.971455421902446e-06, + "loss": 0.94326246, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.8827998638153076 + }, + { + "auxiliary_loss_clip": 0.01761055, + "auxiliary_loss_mlp": 0.01548791, + "balance_loss_clip": 1.41524351, + "balance_loss_mlp": 1.17094505, + "epoch": 0.006132571772132872, + "flos": 24683437670400.0, + "grad_norm": 2.1601661373785253, + "language_loss": 0.90785384, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.94095224, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.8687779903411865 + }, + { + "auxiliary_loss_clip": 0.01760529, + "auxiliary_loss_mlp": 0.01550287, + "balance_loss_clip": 1.4131496, + "balance_loss_mlp": 1.1703428, + "epoch": 0.006192695024800842, + "flos": 21467552780160.0, + "grad_norm": 2.463600373808102, + "language_loss": 0.87996358, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.91307169, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.8925797939300537 + }, + { + "auxiliary_loss_clip": 0.01755765, + "auxiliary_loss_mlp": 0.01565967, + "balance_loss_clip": 1.40840626, + "balance_loss_mlp": 1.18678594, + "epoch": 0.006252818277468811, + "flos": 17422322364000.0, + "grad_norm": 2.6012847530639087, + "language_loss": 0.93815541, + "learning_rate": 2.990301221458371e-06, + "loss": 0.97137272, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.756457805633545 + }, + { + "auxiliary_loss_clip": 0.01757988, + "auxiliary_loss_mlp": 0.01538782, + "balance_loss_clip": 1.41114187, + "balance_loss_mlp": 1.15845644, + "epoch": 0.006312941530136781, + "flos": 19101608406720.0, + "grad_norm": 2.436482014837278, + "language_loss": 0.9625743, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99554199, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.8295345306396484 + }, + { + "auxiliary_loss_clip": 0.01759243, + "auxiliary_loss_mlp": 0.01603898, + "balance_loss_clip": 1.41096163, + "balance_loss_mlp": 1.22357321, + "epoch": 0.006373064782804749, + "flos": 24063116420160.0, + "grad_norm": 2.395072854089453, + "language_loss": 0.86912382, + "learning_rate": 3.002565443382063e-06, + "loss": 0.9027552, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.8451669216156006 + }, + { + "auxiliary_loss_clip": 0.01742398, + "auxiliary_loss_mlp": 0.01553213, + "balance_loss_clip": 1.39409685, + "balance_loss_mlp": 1.17345989, + "epoch": 0.006433188035472719, + "flos": 18334389935520.0, + "grad_norm": 2.9720750557106905, + "language_loss": 0.83280277, + "learning_rate": 3.008611048208843e-06, + "loss": 0.8657589, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.7572295665740967 + }, + { + "auxiliary_loss_clip": 0.01981313, + "auxiliary_loss_mlp": 0.01705265, + "balance_loss_clip": 1.62954116, + "balance_loss_mlp": 1.26886368, + "epoch": 0.006493311288140688, + "flos": 62569382146560.0, + "grad_norm": 1.0717937984720578, + "language_loss": 0.64818078, + "learning_rate": 3.014600414036285e-06, + "loss": 0.68504661, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.377385139465332 + }, + { + "auxiliary_loss_clip": 0.01758691, + "auxiliary_loss_mlp": 0.01534935, + "balance_loss_clip": 1.41020441, + "balance_loss_mlp": 1.14621711, + "epoch": 0.006553434540808658, + "flos": 19502171712000.0, + "grad_norm": 2.1692117788236835, + "language_loss": 0.97716069, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.01009691, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.8767192363739014 + }, + { + "auxiliary_loss_clip": 0.01751116, + "auxiliary_loss_mlp": 0.01540899, + "balance_loss_clip": 1.40280104, + "balance_loss_mlp": 1.17297125, + "epoch": 0.006613557793476627, + "flos": 21107231616960.0, + "grad_norm": 2.4995773281320077, + "language_loss": 0.8397311, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.87265122, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 3.1546387672424316 + }, + { + "auxiliary_loss_clip": 0.01743977, + "auxiliary_loss_mlp": 0.01589338, + "balance_loss_clip": 1.3955729, + "balance_loss_mlp": 1.21931267, + "epoch": 0.006673681046144597, + "flos": 26033124723840.0, + "grad_norm": 2.0763168416112947, + "language_loss": 0.83039463, + "learning_rate": 3.032241303393073e-06, + "loss": 0.86372769, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.921374797821045 + }, + { + "auxiliary_loss_clip": 0.017462, + "auxiliary_loss_mlp": 0.01543952, + "balance_loss_clip": 1.3983928, + "balance_loss_mlp": 1.17526138, + "epoch": 0.006733804298812566, + "flos": 23149948932000.0, + "grad_norm": 2.5106080239573636, + "language_loss": 0.93983191, + "learning_rate": 3.0380158011446e-06, + "loss": 0.97273338, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.8605637550354004 + }, + { + "auxiliary_loss_clip": 0.01754237, + "auxiliary_loss_mlp": 0.01532962, + "balance_loss_clip": 1.40719151, + "balance_loss_mlp": 1.15797782, + "epoch": 0.006793927551480535, + "flos": 11766001465440.0, + "grad_norm": 3.07461674325677, + "language_loss": 0.79607058, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.8289426, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.9569013118743896 + }, + { + "auxiliary_loss_clip": 0.01750703, + "auxiliary_loss_mlp": 0.01560928, + "balance_loss_clip": 1.40373123, + "balance_loss_mlp": 1.19280958, + "epoch": 0.006854050804148504, + "flos": 19173976064640.0, + "grad_norm": 1.897772667984513, + "language_loss": 0.93388563, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.96700197, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.866842269897461 + }, + { + "auxiliary_loss_clip": 0.01744365, + "auxiliary_loss_mlp": 0.01555661, + "balance_loss_clip": 1.39607787, + "balance_loss_mlp": 1.19631648, + "epoch": 0.006914174056816474, + "flos": 21984101491680.0, + "grad_norm": 2.0006031745519053, + "language_loss": 0.94818467, + "learning_rate": 3.055034911425055e-06, + "loss": 0.9811849, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 2.894174098968506 + }, + { + "auxiliary_loss_clip": 0.01740412, + "auxiliary_loss_mlp": 0.01571456, + "balance_loss_clip": 1.39395809, + "balance_loss_mlp": 1.21726108, + "epoch": 0.006974297309484443, + "flos": 16290989913600.0, + "grad_norm": 2.5111618359662655, + "language_loss": 0.81935644, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.85247511, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.89799427986145 + }, + { + "auxiliary_loss_clip": 0.01756091, + "auxiliary_loss_mlp": 0.01597898, + "balance_loss_clip": 1.40819979, + "balance_loss_mlp": 1.23836279, + "epoch": 0.007034420562152413, + "flos": 26106402657600.0, + "grad_norm": 2.3546775422577593, + "language_loss": 0.88289732, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.91643715, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.967754364013672 + }, + { + "auxiliary_loss_clip": 0.01747683, + "auxiliary_loss_mlp": 0.01578624, + "balance_loss_clip": 1.40041351, + "balance_loss_mlp": 1.21260357, + "epoch": 0.007094543814820382, + "flos": 14206058192160.0, + "grad_norm": 4.037397401219647, + "language_loss": 0.84733576, + "learning_rate": 3.071615712271274e-06, + "loss": 0.88059878, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.883877992630005 + }, + { + "auxiliary_loss_clip": 0.01739556, + "auxiliary_loss_mlp": 0.0153911, + "balance_loss_clip": 1.39012754, + "balance_loss_mlp": 1.17289925, + "epoch": 0.007154667067488351, + "flos": 14977790114400.0, + "grad_norm": 4.165754678434279, + "language_loss": 0.99251837, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02530503, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.857454776763916 + }, + { + "auxiliary_loss_clip": 0.01744942, + "auxiliary_loss_mlp": 0.01563572, + "balance_loss_clip": 1.39650273, + "balance_loss_mlp": 1.21452713, + "epoch": 0.00721479032015632, + "flos": 20195353686240.0, + "grad_norm": 13.768772038946743, + "language_loss": 0.89245725, + "learning_rate": 3.082437012097686e-06, + "loss": 0.92554235, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.9515414237976074 + }, + { + "auxiliary_loss_clip": 0.01749639, + "auxiliary_loss_mlp": 0.01565016, + "balance_loss_clip": 1.40118742, + "balance_loss_mlp": 1.20166636, + "epoch": 0.00727491357282429, + "flos": 23149380009600.0, + "grad_norm": 1.7836997040494085, + "language_loss": 0.9336713, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96681786, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.9649617671966553 + }, + { + "auxiliary_loss_clip": 0.01741118, + "auxiliary_loss_mlp": 0.01614394, + "balance_loss_clip": 1.39319777, + "balance_loss_mlp": 1.26077223, + "epoch": 0.007335036825492259, + "flos": 15523392160800.0, + "grad_norm": 2.66450680804015, + "language_loss": 0.90353173, + "learning_rate": 3.09307943925077e-06, + "loss": 0.93708682, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.8650472164154053 + }, + { + "auxiliary_loss_clip": 0.01737641, + "auxiliary_loss_mlp": 0.01588855, + "balance_loss_clip": 1.3884939, + "balance_loss_mlp": 1.23199034, + "epoch": 0.007395160078160229, + "flos": 24245666475840.0, + "grad_norm": 2.353328407007667, + "language_loss": 0.92548186, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95874679, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.922691583633423 + }, + { + "auxiliary_loss_clip": 0.01741655, + "auxiliary_loss_mlp": 0.01595396, + "balance_loss_clip": 1.3924334, + "balance_loss_mlp": 1.23586106, + "epoch": 0.007455283330828198, + "flos": 31762533915360.0, + "grad_norm": 13.31755688668241, + "language_loss": 0.71002614, + "learning_rate": 3.103548811118979e-06, + "loss": 0.74339664, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.9504404067993164 + }, + { + "auxiliary_loss_clip": 0.01736632, + "auxiliary_loss_mlp": 0.01560081, + "balance_loss_clip": 1.38876033, + "balance_loss_mlp": 1.21160841, + "epoch": 0.007515406583496167, + "flos": 26617603498560.0, + "grad_norm": 2.2834658083833106, + "language_loss": 0.8824088, + "learning_rate": 3.108720342404542e-06, + "loss": 0.91537589, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 2.8969600200653076 + }, + { + "auxiliary_loss_clip": 0.01748806, + "auxiliary_loss_mlp": 0.01557336, + "balance_loss_clip": 1.39972019, + "balance_loss_mlp": 1.20123386, + "epoch": 0.007575529836164136, + "flos": 18225686736000.0, + "grad_norm": 2.823815699764895, + "language_loss": 0.82210815, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.85516959, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 3.0311837196350098 + }, + { + "auxiliary_loss_clip": 0.01728145, + "auxiliary_loss_mlp": 0.0159492, + "balance_loss_clip": 1.3780905, + "balance_loss_mlp": 1.24892652, + "epoch": 0.007635653088832106, + "flos": 21582589982400.0, + "grad_norm": 3.0152895408101696, + "language_loss": 0.67418092, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.70741159, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 2.885483980178833 + }, + { + "auxiliary_loss_clip": 0.01736791, + "auxiliary_loss_mlp": 0.01620987, + "balance_loss_clip": 1.38745105, + "balance_loss_mlp": 1.26450384, + "epoch": 0.007695776341500075, + "flos": 25377454064160.0, + "grad_norm": 2.0814222095860897, + "language_loss": 0.88237268, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.91595048, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 4.414397239685059 + }, + { + "auxiliary_loss_clip": 0.01736829, + "auxiliary_loss_mlp": 0.01580432, + "balance_loss_clip": 1.3883127, + "balance_loss_mlp": 1.22947943, + "epoch": 0.007755899594168045, + "flos": 22345977709440.0, + "grad_norm": 1.8427987810408712, + "language_loss": 0.84622967, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87940228, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 4.416522979736328 + }, + { + "auxiliary_loss_clip": 0.01737726, + "auxiliary_loss_mlp": 0.01587106, + "balance_loss_clip": 1.38849807, + "balance_loss_mlp": 1.23596263, + "epoch": 0.007816022846836013, + "flos": 22640454858240.0, + "grad_norm": 5.7070510224437365, + "language_loss": 0.97340685, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00665522, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 5.865610122680664 + }, + { + "auxiliary_loss_clip": 0.01733741, + "auxiliary_loss_mlp": 0.01546292, + "balance_loss_clip": 1.38591838, + "balance_loss_mlp": 1.19400454, + "epoch": 0.007876146099503984, + "flos": 18184648102560.0, + "grad_norm": 2.0000786407199747, + "language_loss": 0.82586932, + "learning_rate": 3.138906441556014e-06, + "loss": 0.8586697, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 2.9050519466400146 + }, + { + "auxiliary_loss_clip": 0.01736594, + "auxiliary_loss_mlp": 0.01577223, + "balance_loss_clip": 1.38702106, + "balance_loss_mlp": 1.22073996, + "epoch": 0.007936269352171952, + "flos": 27121597989120.0, + "grad_norm": 2.3799475931699896, + "language_loss": 0.83147812, + "learning_rate": 3.143802679474861e-06, + "loss": 0.86461627, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 2.951683282852173 + }, + { + "auxiliary_loss_clip": 0.0172519, + "auxiliary_loss_mlp": 0.01547216, + "balance_loss_clip": 1.37570214, + "balance_loss_mlp": 1.18234062, + "epoch": 0.007996392604839923, + "flos": 19028368401120.0, + "grad_norm": 2.490188596773428, + "language_loss": 0.9532305, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98595458, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 2.9053518772125244 + }, + { + "auxiliary_loss_clip": 0.01730014, + "auxiliary_loss_mlp": 0.01565315, + "balance_loss_clip": 1.38067448, + "balance_loss_mlp": 1.20139313, + "epoch": 0.008056515857507891, + "flos": 25486536545280.0, + "grad_norm": 2.1847620683345603, + "language_loss": 0.73396891, + "learning_rate": 3.153484849651286e-06, + "loss": 0.76692224, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 2.8329107761383057 + }, + { + "auxiliary_loss_clip": 0.01721423, + "auxiliary_loss_mlp": 0.01559107, + "balance_loss_clip": 1.37322807, + "balance_loss_mlp": 1.18488479, + "epoch": 0.00811663911017586, + "flos": 20559619378080.0, + "grad_norm": 3.2285186111506525, + "language_loss": 0.8876574, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.92046273, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.79179048538208 + }, + { + "auxiliary_loss_clip": 0.01735207, + "auxiliary_loss_mlp": 0.0156029, + "balance_loss_clip": 1.38543832, + "balance_loss_mlp": 1.18645, + "epoch": 0.00817676236284383, + "flos": 18801328249440.0, + "grad_norm": 1.9643895811442385, + "language_loss": 0.8908295, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.92378443, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 2.7759294509887695 + }, + { + "auxiliary_loss_clip": 0.01731063, + "auxiliary_loss_mlp": 0.01536095, + "balance_loss_clip": 1.38233066, + "balance_loss_mlp": 1.16091919, + "epoch": 0.008236885615511799, + "flos": 23875597775520.0, + "grad_norm": 2.2092794042251604, + "language_loss": 0.84441161, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.87708324, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 2.843644857406616 + }, + { + "auxiliary_loss_clip": 0.01726698, + "auxiliary_loss_mlp": 0.01559797, + "balance_loss_clip": 1.37709701, + "balance_loss_mlp": 1.18786395, + "epoch": 0.00829700886817977, + "flos": 24645888427680.0, + "grad_norm": 1.762521732408188, + "language_loss": 0.89963925, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.93250418, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 2.8695223331451416 + }, + { + "auxiliary_loss_clip": 0.01728609, + "auxiliary_loss_mlp": 0.01512603, + "balance_loss_clip": 1.38030958, + "balance_loss_mlp": 1.1452477, + "epoch": 0.008357132120847738, + "flos": 25264199485440.0, + "grad_norm": 2.347193852611685, + "language_loss": 0.91474783, + "learning_rate": 3.177071816289865e-06, + "loss": 0.94716001, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 2.8741543292999268 + }, + { + "auxiliary_loss_clip": 0.01730096, + "auxiliary_loss_mlp": 0.0156683, + "balance_loss_clip": 1.38181257, + "balance_loss_mlp": 1.20557785, + "epoch": 0.008417255373515706, + "flos": 27347386511520.0, + "grad_norm": 3.1114070231094866, + "language_loss": 0.85656482, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88953412, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 3.0525121688842773 + }, + { + "auxiliary_loss_clip": 0.01722527, + "auxiliary_loss_mlp": 0.01575111, + "balance_loss_clip": 1.37340045, + "balance_loss_mlp": 1.19726503, + "epoch": 0.008477378626183677, + "flos": 17640980392320.0, + "grad_norm": 2.6630567247028627, + "language_loss": 0.84601271, + "learning_rate": 3.186269861057098e-06, + "loss": 0.8789891, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 2.8714964389801025 + }, + { + "auxiliary_loss_clip": 0.01725572, + "auxiliary_loss_mlp": 0.01527909, + "balance_loss_clip": 1.3776958, + "balance_loss_mlp": 1.15464103, + "epoch": 0.008537501878851645, + "flos": 13883058702720.0, + "grad_norm": 2.472889981322148, + "language_loss": 0.81199896, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.8445338, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.8510332107543945 + }, + { + "auxiliary_loss_clip": 0.01943714, + "auxiliary_loss_mlp": 0.0157727, + "balance_loss_clip": 1.59179425, + "balance_loss_mlp": 1.25836182, + "epoch": 0.008597625131519616, + "flos": 71255700201600.0, + "grad_norm": 1.2169281812931712, + "language_loss": 0.66975504, + "learning_rate": 3.195338351584042e-06, + "loss": 0.70496494, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.502661943435669 + }, + { + "auxiliary_loss_clip": 0.01735207, + "auxiliary_loss_mlp": 0.01546251, + "balance_loss_clip": 1.38749325, + "balance_loss_mlp": 1.17431808, + "epoch": 0.008657748384187584, + "flos": 17604758635200.0, + "grad_norm": 1.9963256592660352, + "language_loss": 0.84398341, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.87679803, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.8525829315185547 + }, + { + "auxiliary_loss_clip": 0.01716125, + "auxiliary_loss_mlp": 0.0156646, + "balance_loss_clip": 1.36627352, + "balance_loss_mlp": 1.19147539, + "epoch": 0.008717871636855555, + "flos": 19717150708800.0, + "grad_norm": 2.2350085503222177, + "language_loss": 0.88699204, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91981792, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.911592483520508 + }, + { + "auxiliary_loss_clip": 0.01720657, + "auxiliary_loss_mlp": 0.01562251, + "balance_loss_clip": 1.37039948, + "balance_loss_mlp": 1.18116248, + "epoch": 0.008777994889523523, + "flos": 24719735283840.0, + "grad_norm": 2.1440934897697335, + "language_loss": 0.86068815, + "learning_rate": 3.208706005112005e-06, + "loss": 0.89351726, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.9139187335968018 + }, + { + "auxiliary_loss_clip": 0.01929677, + "auxiliary_loss_mlp": 0.01452957, + "balance_loss_clip": 1.57895803, + "balance_loss_mlp": 1.07530212, + "epoch": 0.008838118142191492, + "flos": 70138401534720.0, + "grad_norm": 0.8830721688214124, + "language_loss": 0.60123968, + "learning_rate": 3.213100917627104e-06, + "loss": 0.63506603, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.451489210128784 + }, + { + "auxiliary_loss_clip": 0.01725147, + "auxiliary_loss_mlp": 0.01559591, + "balance_loss_clip": 1.37743795, + "balance_loss_mlp": 1.17945635, + "epoch": 0.008898241394859462, + "flos": 20046711769920.0, + "grad_norm": 2.1290332128157683, + "language_loss": 0.84760761, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.88045496, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.8733620643615723 + }, + { + "auxiliary_loss_clip": 0.01726135, + "auxiliary_loss_mlp": 0.01575105, + "balance_loss_clip": 1.38000751, + "balance_loss_mlp": 1.18619657, + "epoch": 0.008958364647527431, + "flos": 10745154838080.0, + "grad_norm": 4.3613577979347955, + "language_loss": 0.88850296, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.92151535, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.8716866970062256 + }, + { + "auxiliary_loss_clip": 0.01717083, + "auxiliary_loss_mlp": 0.01540823, + "balance_loss_clip": 1.36745536, + "balance_loss_mlp": 1.15859008, + "epoch": 0.009018487900195401, + "flos": 29129762386080.0, + "grad_norm": 2.7593445880802467, + "language_loss": 0.93126714, + "learning_rate": 3.226108474846181e-06, + "loss": 0.96384615, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.9836010932922363 + }, + { + "auxiliary_loss_clip": 0.01711727, + "auxiliary_loss_mlp": 0.01546345, + "balance_loss_clip": 1.36259961, + "balance_loss_mlp": 1.16029787, + "epoch": 0.00907861115286337, + "flos": 32966461592640.0, + "grad_norm": 1.8004048924915004, + "language_loss": 0.74284309, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.77542388, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.9564008712768555 + }, + { + "auxiliary_loss_clip": 0.01725139, + "auxiliary_loss_mlp": 0.01551248, + "balance_loss_clip": 1.37567556, + "balance_loss_mlp": 1.16672647, + "epoch": 0.009138734405531338, + "flos": 21764609043840.0, + "grad_norm": 2.6561672304416377, + "language_loss": 0.88486606, + "learning_rate": 3.234636443010188e-06, + "loss": 0.91762996, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.877098560333252 + }, + { + "auxiliary_loss_clip": 0.01724278, + "auxiliary_loss_mlp": 0.01558887, + "balance_loss_clip": 1.37773931, + "balance_loss_mlp": 1.18962431, + "epoch": 0.009198857658199309, + "flos": 20844197277120.0, + "grad_norm": 6.996507599604795, + "language_loss": 0.84308064, + "learning_rate": 3.238858439669943e-06, + "loss": 0.87591231, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.825929641723633 + }, + { + "auxiliary_loss_clip": 0.01711297, + "auxiliary_loss_mlp": 0.01521308, + "balance_loss_clip": 1.36346388, + "balance_loss_mlp": 1.13945723, + "epoch": 0.009258980910867277, + "flos": 24829728040800.0, + "grad_norm": 2.9337563003028206, + "language_loss": 0.89949131, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.93181741, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.8418450355529785 + }, + { + "auxiliary_loss_clip": 0.01715966, + "auxiliary_loss_mlp": 0.01543632, + "balance_loss_clip": 1.36889601, + "balance_loss_mlp": 1.16445065, + "epoch": 0.009319104163535248, + "flos": 28770351498720.0, + "grad_norm": 2.12195388527671, + "language_loss": 0.89866269, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.93125868, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.8672049045562744 + }, + { + "auxiliary_loss_clip": 0.01719394, + "auxiliary_loss_mlp": 0.0157538, + "balance_loss_clip": 1.37225676, + "balance_loss_mlp": 1.20211184, + "epoch": 0.009379227416203216, + "flos": 16583912007840.0, + "grad_norm": 2.75688650650059, + "language_loss": 0.86794782, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.9008956, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.901200294494629 + }, + { + "auxiliary_loss_clip": 0.01711674, + "auxiliary_loss_mlp": 0.01548115, + "balance_loss_clip": 1.36356497, + "balance_loss_mlp": 1.1885798, + "epoch": 0.009439350668871187, + "flos": 18331734964320.0, + "grad_norm": 2.3454657836917505, + "language_loss": 0.9975999, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.03019774, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.8727312088012695 + }, + { + "auxiliary_loss_clip": 0.01719335, + "auxiliary_loss_mlp": 0.01559899, + "balance_loss_clip": 1.37146461, + "balance_loss_mlp": 1.18891931, + "epoch": 0.009499473921539155, + "flos": 24352056057600.0, + "grad_norm": 2.1555555588881274, + "language_loss": 0.88211221, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91490453, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.8293707370758057 + }, + { + "auxiliary_loss_clip": 0.01715572, + "auxiliary_loss_mlp": 0.0153204, + "balance_loss_clip": 1.36750221, + "balance_loss_mlp": 1.16506624, + "epoch": 0.009559597174207124, + "flos": 16401210239520.0, + "grad_norm": 5.420302658834396, + "language_loss": 0.86566061, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89813673, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.8185412883758545 + }, + { + "auxiliary_loss_clip": 0.0171384, + "auxiliary_loss_mlp": 0.01550332, + "balance_loss_clip": 1.36598253, + "balance_loss_mlp": 1.17763615, + "epoch": 0.009619720426875094, + "flos": 22859112886560.0, + "grad_norm": 1.6442429450234395, + "language_loss": 0.866157, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.8987987, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.8044028282165527 + }, + { + "auxiliary_loss_clip": 0.01709766, + "auxiliary_loss_mlp": 0.01536656, + "balance_loss_clip": 1.36188006, + "balance_loss_mlp": 1.16682112, + "epoch": 0.009679843679543063, + "flos": 19136995744320.0, + "grad_norm": 2.352684631790906, + "language_loss": 0.91573834, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94820255, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.8148436546325684 + }, + { + "auxiliary_loss_clip": 0.01722924, + "auxiliary_loss_mlp": 0.01573235, + "balance_loss_clip": 1.37298512, + "balance_loss_mlp": 1.21656096, + "epoch": 0.009739966932211033, + "flos": 20305080946080.0, + "grad_norm": 2.4212853321208465, + "language_loss": 0.91630435, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94926596, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.831850051879883 + }, + { + "auxiliary_loss_clip": 0.01867766, + "auxiliary_loss_mlp": 0.01462891, + "balance_loss_clip": 1.51778185, + "balance_loss_mlp": 1.09973145, + "epoch": 0.009800090184879002, + "flos": 67040588099520.0, + "grad_norm": 1.195957073672525, + "language_loss": 0.7235657, + "learning_rate": 3.279622189013474e-06, + "loss": 0.75687224, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.362168073654175 + }, + { + "auxiliary_loss_clip": 0.01719813, + "auxiliary_loss_mlp": 0.01561163, + "balance_loss_clip": 1.37090206, + "balance_loss_mlp": 1.1962868, + "epoch": 0.00986021343754697, + "flos": 17166570230880.0, + "grad_norm": 11.366824917909527, + "language_loss": 0.84556752, + "learning_rate": 3.283560135133457e-06, + "loss": 0.87837732, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.7935991287231445 + }, + { + "auxiliary_loss_clip": 0.01702741, + "auxiliary_loss_mlp": 0.01565294, + "balance_loss_clip": 1.3527441, + "balance_loss_mlp": 1.21052706, + "epoch": 0.00992033669021494, + "flos": 17751390359040.0, + "grad_norm": 2.1942438360557235, + "language_loss": 0.88950086, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.92218119, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.769026756286621 + }, + { + "auxiliary_loss_clip": 0.01702929, + "auxiliary_loss_mlp": 0.01531108, + "balance_loss_clip": 1.35396397, + "balance_loss_mlp": 1.16737688, + "epoch": 0.00998045994288291, + "flos": 25299283397760.0, + "grad_norm": 2.025965332734924, + "language_loss": 0.79996282, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.83230317, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 4.463557720184326 + }, + { + "auxiliary_loss_clip": 0.01703807, + "auxiliary_loss_mlp": 0.01555821, + "balance_loss_clip": 1.35345018, + "balance_loss_mlp": 1.20010042, + "epoch": 0.01004058319555088, + "flos": 32301270964800.0, + "grad_norm": 2.2850880721775395, + "language_loss": 0.92015392, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.95275021, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 5.872593879699707 + }, + { + "auxiliary_loss_clip": 0.01699729, + "auxiliary_loss_mlp": 0.01538584, + "balance_loss_clip": 1.34902227, + "balance_loss_mlp": 1.17637801, + "epoch": 0.010100706448218848, + "flos": 11321099776800.0, + "grad_norm": 3.049994399903218, + "language_loss": 0.90450084, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93688399, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 4.349759101867676 + }, + { + "auxiliary_loss_clip": 0.01702493, + "auxiliary_loss_mlp": 0.01548905, + "balance_loss_clip": 1.35163808, + "balance_loss_mlp": 1.19032347, + "epoch": 0.010160829700886819, + "flos": 29722281930720.0, + "grad_norm": 1.8080219438608431, + "language_loss": 0.87280345, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.90531743, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 2.89070200920105 + }, + { + "auxiliary_loss_clip": 0.01700104, + "auxiliary_loss_mlp": 0.01550363, + "balance_loss_clip": 1.34978199, + "balance_loss_mlp": 1.19006443, + "epoch": 0.010220952953554787, + "flos": 20414428924320.0, + "grad_norm": 1.828648440418742, + "language_loss": 0.84737921, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87988389, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 2.856459140777588 + }, + { + "auxiliary_loss_clip": 0.01700668, + "auxiliary_loss_mlp": 0.01582541, + "balance_loss_clip": 1.34972644, + "balance_loss_mlp": 1.22853696, + "epoch": 0.010281076206222756, + "flos": 31287820328640.0, + "grad_norm": 2.178603519051588, + "language_loss": 0.90081751, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.9336496, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 2.872664451599121 + }, + { + "auxiliary_loss_clip": 0.01705793, + "auxiliary_loss_mlp": 0.01576427, + "balance_loss_clip": 1.35493422, + "balance_loss_mlp": 1.21116984, + "epoch": 0.010341199458890726, + "flos": 21984518701440.0, + "grad_norm": 2.095616406327937, + "language_loss": 0.8895877, + "learning_rate": 3.314225558471224e-06, + "loss": 0.92240989, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 2.8006510734558105 + }, + { + "auxiliary_loss_clip": 0.01704863, + "auxiliary_loss_mlp": 0.01602379, + "balance_loss_clip": 1.35435545, + "balance_loss_mlp": 1.24780321, + "epoch": 0.010401322711558695, + "flos": 30813182598240.0, + "grad_norm": 2.132551711851256, + "language_loss": 0.81149018, + "learning_rate": 3.317958045350308e-06, + "loss": 0.84456253, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 2.843600034713745 + }, + { + "auxiliary_loss_clip": 0.01703392, + "auxiliary_loss_mlp": 0.01551082, + "balance_loss_clip": 1.35318375, + "balance_loss_mlp": 1.18811345, + "epoch": 0.010461445964226665, + "flos": 24717307881600.0, + "grad_norm": 2.1744383175475885, + "language_loss": 0.82886952, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.86141419, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 2.8507792949676514 + }, + { + "auxiliary_loss_clip": 0.0169844, + "auxiliary_loss_mlp": 0.01551568, + "balance_loss_clip": 1.34729385, + "balance_loss_mlp": 1.18402243, + "epoch": 0.010521569216894634, + "flos": 27712903832640.0, + "grad_norm": 3.662113990685609, + "language_loss": 0.73016095, + "learning_rate": 3.325358726641591e-06, + "loss": 0.76266104, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 2.815857172012329 + }, + { + "auxiliary_loss_clip": 0.01696195, + "auxiliary_loss_mlp": 0.01558007, + "balance_loss_clip": 1.34536684, + "balance_loss_mlp": 1.19732738, + "epoch": 0.010581692469562603, + "flos": 12459979931040.0, + "grad_norm": 2.6751651807074817, + "language_loss": 0.9820956, + "learning_rate": 3.329027409977902e-06, + "loss": 1.01463759, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.786452293395996 + }, + { + "auxiliary_loss_clip": 0.01715495, + "auxiliary_loss_mlp": 0.01551001, + "balance_loss_clip": 1.36581874, + "balance_loss_mlp": 1.17944956, + "epoch": 0.010641815722230573, + "flos": 19429652341440.0, + "grad_norm": 2.656934518548229, + "language_loss": 0.77134031, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.80400527, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 2.811265230178833 + }, + { + "auxiliary_loss_clip": 0.01696151, + "auxiliary_loss_mlp": 0.01536909, + "balance_loss_clip": 1.34335732, + "balance_loss_mlp": 1.17622948, + "epoch": 0.010701938974898541, + "flos": 18334541648160.0, + "grad_norm": 3.1194623571973814, + "language_loss": 0.76766741, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79999799, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 2.8490772247314453 + }, + { + "auxiliary_loss_clip": 0.01706358, + "auxiliary_loss_mlp": 0.01538964, + "balance_loss_clip": 1.35557723, + "balance_loss_mlp": 1.16569543, + "epoch": 0.010762062227566512, + "flos": 19205760227040.0, + "grad_norm": 3.8205219704341205, + "language_loss": 0.84394693, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.87640023, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.897707462310791 + }, + { + "auxiliary_loss_clip": 0.01697939, + "auxiliary_loss_mlp": 0.01543431, + "balance_loss_clip": 1.34681225, + "balance_loss_mlp": 1.18084407, + "epoch": 0.01082218548023448, + "flos": 31427814624480.0, + "grad_norm": 2.063199257680312, + "language_loss": 0.838539, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.87095261, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 2.9501585960388184 + }, + { + "auxiliary_loss_clip": 0.01696971, + "auxiliary_loss_mlp": 0.01522402, + "balance_loss_clip": 1.34546828, + "balance_loss_mlp": 1.16668105, + "epoch": 0.01088230873290245, + "flos": 25048803278880.0, + "grad_norm": 2.869909358821831, + "language_loss": 0.7767657, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80895936, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 3.0081374645233154 + }, + { + "auxiliary_loss_clip": 0.01691784, + "auxiliary_loss_mlp": 0.01518475, + "balance_loss_clip": 1.33902824, + "balance_loss_mlp": 1.15264571, + "epoch": 0.01094243198557042, + "flos": 22895638068960.0, + "grad_norm": 2.827841035803279, + "language_loss": 0.76505697, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.79715955, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 2.8215620517730713 + }, + { + "auxiliary_loss_clip": 0.01691542, + "auxiliary_loss_mlp": 0.0153763, + "balance_loss_clip": 1.3392756, + "balance_loss_mlp": 1.16760421, + "epoch": 0.011002555238238388, + "flos": 17166949512480.0, + "grad_norm": 2.3041410538917275, + "language_loss": 0.87784827, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.91013998, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.8305158615112305 + }, + { + "auxiliary_loss_clip": 0.01697247, + "auxiliary_loss_mlp": 0.01539508, + "balance_loss_clip": 1.34515762, + "balance_loss_mlp": 1.17329741, + "epoch": 0.011062678490906358, + "flos": 22312372995360.0, + "grad_norm": 2.1734573352521, + "language_loss": 0.8680886, + "learning_rate": 3.357647774369736e-06, + "loss": 0.90045619, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.865373134613037 + }, + { + "auxiliary_loss_clip": 0.01695651, + "auxiliary_loss_mlp": 0.01527724, + "balance_loss_clip": 1.34222472, + "balance_loss_mlp": 1.1460638, + "epoch": 0.011122801743574327, + "flos": 24390629360640.0, + "grad_norm": 1.8408781718147509, + "language_loss": 0.83616418, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86839795, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.80383038520813 + }, + { + "auxiliary_loss_clip": 0.01711947, + "auxiliary_loss_mlp": 0.01531756, + "balance_loss_clip": 1.36006951, + "balance_loss_mlp": 1.16401958, + "epoch": 0.011182924996242297, + "flos": 18152219161440.0, + "grad_norm": 4.6245480717457195, + "language_loss": 0.71153557, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.74397266, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.7682807445526123 + }, + { + "auxiliary_loss_clip": 0.01705043, + "auxiliary_loss_mlp": 0.01511889, + "balance_loss_clip": 1.35331547, + "balance_loss_mlp": 1.12927496, + "epoch": 0.011243048248910266, + "flos": 15488611673760.0, + "grad_norm": 2.2469952345307203, + "language_loss": 1.02204967, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.05421901, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.7634239196777344 + }, + { + "auxiliary_loss_clip": 0.01698203, + "auxiliary_loss_mlp": 0.01551083, + "balance_loss_clip": 1.3466543, + "balance_loss_mlp": 1.17724323, + "epoch": 0.011303171501578235, + "flos": 40920076166400.0, + "grad_norm": 1.9788922462265854, + "language_loss": 0.7521584, + "learning_rate": 3.371494591560139e-06, + "loss": 0.78465128, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.9576871395111084 + }, + { + "auxiliary_loss_clip": 0.01870022, + "auxiliary_loss_mlp": 0.01467651, + "balance_loss_clip": 1.51418829, + "balance_loss_mlp": 1.12585449, + "epoch": 0.011363294754246205, + "flos": 66308908678560.0, + "grad_norm": 0.7825617772952161, + "language_loss": 0.56253612, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.59591287, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.3918678760528564 + }, + { + "auxiliary_loss_clip": 0.01691063, + "auxiliary_loss_mlp": 0.01552314, + "balance_loss_clip": 1.3394258, + "balance_loss_mlp": 1.18190694, + "epoch": 0.011423418006914174, + "flos": 24902626692960.0, + "grad_norm": 2.2146069780177036, + "language_loss": 0.94904017, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.98147392, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.834447145462036 + }, + { + "auxiliary_loss_clip": 0.01702178, + "auxiliary_loss_mlp": 0.0155194, + "balance_loss_clip": 1.34989762, + "balance_loss_mlp": 1.18477559, + "epoch": 0.011483541259582144, + "flos": 19793918033280.0, + "grad_norm": 3.1285342875551456, + "language_loss": 0.84787691, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.88041812, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.805155038833618 + }, + { + "auxiliary_loss_clip": 0.01694557, + "auxiliary_loss_mlp": 0.01541419, + "balance_loss_clip": 1.34277177, + "balance_loss_mlp": 1.176162, + "epoch": 0.011543664512250112, + "flos": 26179339237920.0, + "grad_norm": 1.9831511869088834, + "language_loss": 0.9185127, + "learning_rate": 3.385049875042367e-06, + "loss": 0.95087248, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.9093499183654785 + }, + { + "auxiliary_loss_clip": 0.01688751, + "auxiliary_loss_mlp": 0.01553585, + "balance_loss_clip": 1.33552814, + "balance_loss_mlp": 1.18718326, + "epoch": 0.011603787764918083, + "flos": 23771256314400.0, + "grad_norm": 2.6510668185977035, + "language_loss": 0.86961138, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.90203476, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.8885316848754883 + }, + { + "auxiliary_loss_clip": 0.01692355, + "auxiliary_loss_mlp": 0.01549839, + "balance_loss_clip": 1.33946395, + "balance_loss_mlp": 1.1725651, + "epoch": 0.011663911017586051, + "flos": 25956888393600.0, + "grad_norm": 2.086642094680133, + "language_loss": 0.91992021, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.95234209, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.8166868686676025 + }, + { + "auxiliary_loss_clip": 0.01697581, + "auxiliary_loss_mlp": 0.01536334, + "balance_loss_clip": 1.34543514, + "balance_loss_mlp": 1.162112, + "epoch": 0.01172403427025402, + "flos": 17897111807040.0, + "grad_norm": 2.3763450210259105, + "language_loss": 0.90011328, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.93245244, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.761023998260498 + }, + { + "auxiliary_loss_clip": 0.01702205, + "auxiliary_loss_mlp": 0.01559601, + "balance_loss_clip": 1.34890616, + "balance_loss_mlp": 1.18232751, + "epoch": 0.01178415752292199, + "flos": 17896467028320.0, + "grad_norm": 6.007963742714345, + "language_loss": 0.8594197, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.89203775, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.716693639755249 + }, + { + "auxiliary_loss_clip": 0.01690547, + "auxiliary_loss_mlp": 0.01555613, + "balance_loss_clip": 1.33849967, + "balance_loss_mlp": 1.18291736, + "epoch": 0.011844280775589959, + "flos": 22895789781600.0, + "grad_norm": 2.3915002153282816, + "language_loss": 0.93134719, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.96380877, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.839121103286743 + }, + { + "auxiliary_loss_clip": 0.0169488, + "auxiliary_loss_mlp": 0.01560296, + "balance_loss_clip": 1.3432374, + "balance_loss_mlp": 1.19275033, + "epoch": 0.01190440402825793, + "flos": 26982817394400.0, + "grad_norm": 2.763234268171346, + "language_loss": 0.79082739, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.82337916, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.834224224090576 + }, + { + "auxiliary_loss_clip": 0.01685908, + "auxiliary_loss_mlp": 0.01534095, + "balance_loss_clip": 1.33410287, + "balance_loss_mlp": 1.16044569, + "epoch": 0.011964527280925898, + "flos": 20523663118080.0, + "grad_norm": 1.8719981552930918, + "language_loss": 0.88083422, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.9130342, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.778709650039673 + }, + { + "auxiliary_loss_clip": 0.01688846, + "auxiliary_loss_mlp": 0.01534796, + "balance_loss_clip": 1.33703005, + "balance_loss_mlp": 1.15828562, + "epoch": 0.012024650533593867, + "flos": 27748480811040.0, + "grad_norm": 1.918867181444552, + "language_loss": 0.81377161, + "learning_rate": 3.411333205349222e-06, + "loss": 0.84600806, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.9232215881347656 + }, + { + "auxiliary_loss_clip": 0.01681776, + "auxiliary_loss_mlp": 0.015354, + "balance_loss_clip": 1.32874727, + "balance_loss_mlp": 1.15526533, + "epoch": 0.012084773786261837, + "flos": 10453560229440.0, + "grad_norm": 2.7321659559021616, + "language_loss": 0.87714136, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90931308, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.7345151901245117 + }, + { + "auxiliary_loss_clip": 0.01693023, + "auxiliary_loss_mlp": 0.01567344, + "balance_loss_clip": 1.34213126, + "balance_loss_mlp": 1.19445753, + "epoch": 0.012144897038929806, + "flos": 23107620741120.0, + "grad_norm": 1.8702569041277353, + "language_loss": 0.84252739, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.87513107, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.8304929733276367 + }, + { + "auxiliary_loss_clip": 0.01684009, + "auxiliary_loss_mlp": 0.01544792, + "balance_loss_clip": 1.33160472, + "balance_loss_mlp": 1.177055, + "epoch": 0.012205020291597776, + "flos": 21035357025120.0, + "grad_norm": 2.057107781114424, + "language_loss": 0.90109992, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.93338799, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.8524169921875 + }, + { + "auxiliary_loss_clip": 0.01854512, + "auxiliary_loss_mlp": 0.01465897, + "balance_loss_clip": 1.49588132, + "balance_loss_mlp": 1.10044861, + "epoch": 0.012265143544265745, + "flos": 68453691765120.0, + "grad_norm": 1.0317782798947996, + "language_loss": 0.61268771, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.64589179, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 3.304670572280884 + }, + { + "auxiliary_loss_clip": 0.01692137, + "auxiliary_loss_mlp": 0.01576769, + "balance_loss_clip": 1.33987463, + "balance_loss_mlp": 1.20693445, + "epoch": 0.012325266796933715, + "flos": 17021152208160.0, + "grad_norm": 4.050737366672003, + "language_loss": 0.91553247, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.94822145, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 4.355324983596802 + }, + { + "auxiliary_loss_clip": 0.01686511, + "auxiliary_loss_mlp": 0.0154205, + "balance_loss_clip": 1.33508766, + "balance_loss_mlp": 1.17583895, + "epoch": 0.012385390049601683, + "flos": 20191940151840.0, + "grad_norm": 2.044755999026082, + "language_loss": 0.89395511, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.92624068, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 4.280594825744629 + }, + { + "auxiliary_loss_clip": 0.01685188, + "auxiliary_loss_mlp": 0.0156641, + "balance_loss_clip": 1.33127737, + "balance_loss_mlp": 1.20973539, + "epoch": 0.012445513302269652, + "flos": 16254995725440.0, + "grad_norm": 2.5706806827043898, + "language_loss": 0.95565253, + "learning_rate": 3.43348263905683e-06, + "loss": 0.98816848, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 2.7732980251312256 + }, + { + "auxiliary_loss_clip": 0.01687933, + "auxiliary_loss_mlp": 0.01538027, + "balance_loss_clip": 1.33426118, + "balance_loss_mlp": 1.17276955, + "epoch": 0.012505636554937622, + "flos": 23771673524160.0, + "grad_norm": 1.9014365076485493, + "language_loss": 0.7617833, + "learning_rate": 3.436585547151547e-06, + "loss": 0.79404289, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 2.8138091564178467 + }, + { + "auxiliary_loss_clip": 0.01688557, + "auxiliary_loss_mlp": 0.01521014, + "balance_loss_clip": 1.3353765, + "balance_loss_mlp": 1.14850926, + "epoch": 0.012565759807605591, + "flos": 30594183216480.0, + "grad_norm": 3.083677546924525, + "language_loss": 0.98424929, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01634502, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.8303275108337402 + }, + { + "auxiliary_loss_clip": 0.01699589, + "auxiliary_loss_mlp": 0.01546901, + "balance_loss_clip": 1.34614325, + "balance_loss_mlp": 1.18927312, + "epoch": 0.012625883060273561, + "flos": 40116635938080.0, + "grad_norm": 4.517419643107152, + "language_loss": 0.85425204, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.8867169, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.9440701007843018 + }, + { + "auxiliary_loss_clip": 0.01684929, + "auxiliary_loss_mlp": 0.01544437, + "balance_loss_clip": 1.33234191, + "balance_loss_mlp": 1.18680882, + "epoch": 0.01268600631294153, + "flos": 27091634378400.0, + "grad_norm": 2.523297547028659, + "language_loss": 0.9709872, + "learning_rate": 3.445805545042314e-06, + "loss": 1.00328088, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 2.8086068630218506 + }, + { + "auxiliary_loss_clip": 0.01700762, + "auxiliary_loss_mlp": 0.0154594, + "balance_loss_clip": 1.34538972, + "balance_loss_mlp": 1.17648649, + "epoch": 0.012746129565609499, + "flos": 16984664953920.0, + "grad_norm": 2.4017535922299604, + "language_loss": 0.95292199, + "learning_rate": 3.448849769075239e-06, + "loss": 0.98538905, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 2.7818140983581543 + }, + { + "auxiliary_loss_clip": 0.01697492, + "auxiliary_loss_mlp": 0.01506539, + "balance_loss_clip": 1.34383631, + "balance_loss_mlp": 1.13384318, + "epoch": 0.012806252818277469, + "flos": 46536875557920.0, + "grad_norm": 2.226600623291917, + "language_loss": 0.76118517, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.79322553, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 3.0989186763763428 + }, + { + "auxiliary_loss_clip": 0.01688609, + "auxiliary_loss_mlp": 0.01558181, + "balance_loss_clip": 1.33484936, + "balance_loss_mlp": 1.19559383, + "epoch": 0.012866376070945438, + "flos": 14390163302400.0, + "grad_norm": 5.623493818493, + "language_loss": 0.86624074, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89870864, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 2.739504098892212 + }, + { + "auxiliary_loss_clip": 0.01689994, + "auxiliary_loss_mlp": 0.01539145, + "balance_loss_clip": 1.33581042, + "balance_loss_mlp": 1.18094468, + "epoch": 0.012926499323613408, + "flos": 26143231265280.0, + "grad_norm": 2.722672994598692, + "language_loss": 0.77557433, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80786574, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 2.8244988918304443 + }, + { + "auxiliary_loss_clip": 0.01696417, + "auxiliary_loss_mlp": 0.01539175, + "balance_loss_clip": 1.34090984, + "balance_loss_mlp": 1.16457224, + "epoch": 0.012986622576281377, + "flos": 30119810983200.0, + "grad_norm": 3.38944731756912, + "language_loss": 0.90455598, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93691194, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.8154568672180176 + }, + { + "auxiliary_loss_clip": 0.01689287, + "auxiliary_loss_mlp": 0.01546115, + "balance_loss_clip": 1.33538043, + "balance_loss_mlp": 1.18848753, + "epoch": 0.013046745828949347, + "flos": 13955729785920.0, + "grad_norm": 8.765678491123934, + "language_loss": 0.93807268, + "learning_rate": 3.463858658104523e-06, + "loss": 0.97042668, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 2.75640606880188 + }, + { + "auxiliary_loss_clip": 0.01693587, + "auxiliary_loss_mlp": 0.01540793, + "balance_loss_clip": 1.33998537, + "balance_loss_mlp": 1.18183041, + "epoch": 0.013106869081617315, + "flos": 17349765065280.0, + "grad_norm": 2.157812131289258, + "language_loss": 0.93563455, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96797836, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 2.7661092281341553 + }, + { + "auxiliary_loss_clip": 0.01696226, + "auxiliary_loss_mlp": 0.01562011, + "balance_loss_clip": 1.34220266, + "balance_loss_mlp": 1.20304823, + "epoch": 0.013166992334285284, + "flos": 25887289491360.0, + "grad_norm": 1.905875817245893, + "language_loss": 0.86135423, + "learning_rate": 3.46976560030214e-06, + "loss": 0.89393663, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 2.836292028427124 + }, + { + "auxiliary_loss_clip": 0.0168433, + "auxiliary_loss_mlp": 0.01548337, + "balance_loss_clip": 1.32970405, + "balance_loss_mlp": 1.18021834, + "epoch": 0.013227115586953254, + "flos": 31178661991200.0, + "grad_norm": 2.2311219191981233, + "language_loss": 0.87493622, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90726292, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 2.8863656520843506 + }, + { + "auxiliary_loss_clip": 0.01691309, + "auxiliary_loss_mlp": 0.01563831, + "balance_loss_clip": 1.3369292, + "balance_loss_mlp": 1.2107805, + "epoch": 0.013287238839621223, + "flos": 20411394671520.0, + "grad_norm": 1.8796814110327285, + "language_loss": 0.86717319, + "learning_rate": 3.475618842282164e-06, + "loss": 0.8997246, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 2.8192145824432373 + }, + { + "auxiliary_loss_clip": 0.01689396, + "auxiliary_loss_mlp": 0.01573924, + "balance_loss_clip": 1.33530724, + "balance_loss_mlp": 1.22011137, + "epoch": 0.013347362092289193, + "flos": 14138886692160.0, + "grad_norm": 3.7548799739885887, + "language_loss": 0.92322665, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.9558599, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 2.767164468765259 + }, + { + "auxiliary_loss_clip": 0.01704214, + "auxiliary_loss_mlp": 0.01560469, + "balance_loss_clip": 1.35106111, + "balance_loss_mlp": 1.19521165, + "epoch": 0.013407485344957162, + "flos": 21799806740640.0, + "grad_norm": 3.7341333225039017, + "language_loss": 0.95956993, + "learning_rate": 3.481419351635897e-06, + "loss": 0.99221671, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 2.760019540786743 + }, + { + "auxiliary_loss_clip": 0.01700607, + "auxiliary_loss_mlp": 0.01575193, + "balance_loss_clip": 1.34558082, + "balance_loss_mlp": 1.21413195, + "epoch": 0.013467608597625132, + "flos": 18623443357440.0, + "grad_norm": 3.675212203464107, + "language_loss": 0.88235414, + "learning_rate": 3.484300126837776e-06, + "loss": 0.91511214, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.8537540435791016 + }, + { + "auxiliary_loss_clip": 0.01695528, + "auxiliary_loss_mlp": 0.01561548, + "balance_loss_clip": 1.34202051, + "balance_loss_mlp": 1.19629097, + "epoch": 0.013527731850293101, + "flos": 18554337521280.0, + "grad_norm": 1.9584593754396649, + "language_loss": 0.89430404, + "learning_rate": 3.487168070036317e-06, + "loss": 0.92687476, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.821449041366577 + }, + { + "auxiliary_loss_clip": 0.01695155, + "auxiliary_loss_mlp": 0.01542026, + "balance_loss_clip": 1.34169102, + "balance_loss_mlp": 1.17600608, + "epoch": 0.01358785510296107, + "flos": 19167035211360.0, + "grad_norm": 2.083778956670331, + "language_loss": 0.99271524, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.025087, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.81668758392334 + }, + { + "auxiliary_loss_clip": 0.01697485, + "auxiliary_loss_mlp": 0.01560788, + "balance_loss_clip": 1.34283841, + "balance_loss_mlp": 1.19076288, + "epoch": 0.01364797835562904, + "flos": 23332119706080.0, + "grad_norm": 4.2359512939757, + "language_loss": 0.91050708, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.94308984, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.7931067943573 + }, + { + "auxiliary_loss_clip": 0.01863722, + "auxiliary_loss_mlp": 0.0148111, + "balance_loss_clip": 1.50392509, + "balance_loss_mlp": 1.15838623, + "epoch": 0.013708101608297009, + "flos": 71001389338560.0, + "grad_norm": 0.9511751740443302, + "language_loss": 0.57704532, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.6104936, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.4798483848571777 + }, + { + "auxiliary_loss_clip": 0.01688406, + "auxiliary_loss_mlp": 0.01529326, + "balance_loss_clip": 1.33440042, + "balance_loss_mlp": 1.15052676, + "epoch": 0.013768224860964979, + "flos": 16326984101760.0, + "grad_norm": 2.6634723467544417, + "language_loss": 0.88070887, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.91288626, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.787541627883911 + }, + { + "auxiliary_loss_clip": 0.01712089, + "auxiliary_loss_mlp": 0.01536662, + "balance_loss_clip": 1.35724604, + "balance_loss_mlp": 1.15824425, + "epoch": 0.013828348113632948, + "flos": 20195088189120.0, + "grad_norm": 6.08626064162757, + "language_loss": 0.8416934, + "learning_rate": 3.501319237118231e-06, + "loss": 0.87418091, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.8080224990844727 + }, + { + "auxiliary_loss_clip": 0.0169142, + "auxiliary_loss_mlp": 0.01529983, + "balance_loss_clip": 1.33808398, + "balance_loss_mlp": 1.14832234, + "epoch": 0.013888471366300916, + "flos": 20743193494080.0, + "grad_norm": 2.166544397504632, + "language_loss": 0.90397227, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.93618625, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.860902786254883 + }, + { + "auxiliary_loss_clip": 0.01700944, + "auxiliary_loss_mlp": 0.01525007, + "balance_loss_clip": 1.34806156, + "balance_loss_mlp": 1.14849615, + "epoch": 0.013948594618968886, + "flos": 22092804691200.0, + "grad_norm": 2.138196877825659, + "language_loss": 0.83518463, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.8674441, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.9156434535980225 + }, + { + "auxiliary_loss_clip": 0.01693731, + "auxiliary_loss_mlp": 0.01503061, + "balance_loss_clip": 1.34023964, + "balance_loss_mlp": 1.12540555, + "epoch": 0.014008717871636855, + "flos": 19065159080640.0, + "grad_norm": 3.1526724356087095, + "language_loss": 0.74024785, + "learning_rate": 3.509663010692652e-06, + "loss": 0.77221572, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.852116346359253 + }, + { + "auxiliary_loss_clip": 0.01700504, + "auxiliary_loss_mlp": 0.01550868, + "balance_loss_clip": 1.3473618, + "balance_loss_mlp": 1.17168701, + "epoch": 0.014068841124304825, + "flos": 14532167790720.0, + "grad_norm": 3.00835166919091, + "language_loss": 0.85821402, + "learning_rate": 3.512420411838642e-06, + "loss": 0.89072782, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.8559811115264893 + }, + { + "auxiliary_loss_clip": 0.01705012, + "auxiliary_loss_mlp": 0.01533433, + "balance_loss_clip": 1.35153782, + "balance_loss_mlp": 1.15825713, + "epoch": 0.014128964376972794, + "flos": 18079358437440.0, + "grad_norm": 2.684818354863506, + "language_loss": 0.89416373, + "learning_rate": 3.515166054308634e-06, + "loss": 0.92654824, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.8330349922180176 + }, + { + "auxiliary_loss_clip": 0.01702487, + "auxiliary_loss_mlp": 0.01545154, + "balance_loss_clip": 1.34945273, + "balance_loss_mlp": 1.15796185, + "epoch": 0.014189087629640764, + "flos": 25336491287040.0, + "grad_norm": 3.024877981843501, + "language_loss": 0.85433221, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88680857, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.8755457401275635 + }, + { + "auxiliary_loss_clip": 0.01700305, + "auxiliary_loss_mlp": 0.01510944, + "balance_loss_clip": 1.3466841, + "balance_loss_mlp": 1.12985563, + "epoch": 0.014249210882308733, + "flos": 36142824975840.0, + "grad_norm": 1.9815163017115387, + "language_loss": 0.82309687, + "learning_rate": 3.520622461401154e-06, + "loss": 0.85520941, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.9138455390930176 + }, + { + "auxiliary_loss_clip": 0.01708353, + "auxiliary_loss_mlp": 0.01524552, + "balance_loss_clip": 1.35694027, + "balance_loss_mlp": 1.14003074, + "epoch": 0.014309334134976702, + "flos": 12934655589600.0, + "grad_norm": 2.997937288466017, + "language_loss": 0.77394056, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.80626965, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.810596466064453 + }, + { + "auxiliary_loss_clip": 0.01704992, + "auxiliary_loss_mlp": 0.01554898, + "balance_loss_clip": 1.35189891, + "balance_loss_mlp": 1.18334639, + "epoch": 0.014369457387644672, + "flos": 20779642820160.0, + "grad_norm": 7.832249317372601, + "language_loss": 0.87145317, + "learning_rate": 3.526033015791284e-06, + "loss": 0.90405202, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.7360951900482178 + }, + { + "auxiliary_loss_clip": 0.01697967, + "auxiliary_loss_mlp": 0.01522211, + "balance_loss_clip": 1.34546232, + "balance_loss_mlp": 1.13540041, + "epoch": 0.01442958064031264, + "flos": 25850916021600.0, + "grad_norm": 3.339017256256772, + "language_loss": 0.93321234, + "learning_rate": 3.528721337790862e-06, + "loss": 0.96541405, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.7987143993377686 + }, + { + "auxiliary_loss_clip": 0.01701819, + "auxiliary_loss_mlp": 0.0152238, + "balance_loss_clip": 1.34823, + "balance_loss_mlp": 1.14796734, + "epoch": 0.014489703892980611, + "flos": 28222208265600.0, + "grad_norm": 2.283961163635765, + "language_loss": 0.84761381, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87985575, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.8851492404937744 + }, + { + "auxiliary_loss_clip": 0.01704463, + "auxiliary_loss_mlp": 0.01543307, + "balance_loss_clip": 1.34982276, + "balance_loss_mlp": 1.16012025, + "epoch": 0.01454982714564858, + "flos": 22493064571200.0, + "grad_norm": 2.2231437129035627, + "language_loss": 0.8870855, + "learning_rate": 3.534064540103573e-06, + "loss": 0.91956317, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.770881175994873 + }, + { + "auxiliary_loss_clip": 0.01704922, + "auxiliary_loss_mlp": 0.01513205, + "balance_loss_clip": 1.35144758, + "balance_loss_mlp": 1.13612223, + "epoch": 0.014609950398316548, + "flos": 21655374850080.0, + "grad_norm": 2.2978941799048216, + "language_loss": 0.86624271, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89842397, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 4.265936374664307 + }, + { + "auxiliary_loss_clip": 0.01711423, + "auxiliary_loss_mlp": 0.01535812, + "balance_loss_clip": 1.35858309, + "balance_loss_mlp": 1.1579659, + "epoch": 0.014670073650984519, + "flos": 21872060614080.0, + "grad_norm": 1.584795801891008, + "language_loss": 0.8414253, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.87389767, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 5.912152051925659 + }, + { + "auxiliary_loss_clip": 0.01701937, + "auxiliary_loss_mlp": 0.01542488, + "balance_loss_clip": 1.34868515, + "balance_loss_mlp": 1.16845691, + "epoch": 0.014730196903652487, + "flos": 23185867263840.0, + "grad_norm": 2.35355327839266, + "language_loss": 0.79121792, + "learning_rate": 3.54199711087864e-06, + "loss": 0.82366216, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 2.8626179695129395 + }, + { + "auxiliary_loss_clip": 0.0169795, + "auxiliary_loss_mlp": 0.01527106, + "balance_loss_clip": 1.34487545, + "balance_loss_mlp": 1.15898776, + "epoch": 0.014790320156320457, + "flos": 23224895704800.0, + "grad_norm": 2.3074297555938204, + "language_loss": 0.84406215, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.87631273, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 2.8378124237060547 + }, + { + "auxiliary_loss_clip": 0.01715092, + "auxiliary_loss_mlp": 0.01543232, + "balance_loss_clip": 1.36133397, + "balance_loss_mlp": 1.17606783, + "epoch": 0.014850443408988426, + "flos": 15817338315360.0, + "grad_norm": 2.1651905947486743, + "language_loss": 0.90130997, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.93389326, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 2.8541717529296875 + }, + { + "auxiliary_loss_clip": 0.01698317, + "auxiliary_loss_mlp": 0.0152598, + "balance_loss_clip": 1.34522939, + "balance_loss_mlp": 1.14889705, + "epoch": 0.014910566661656396, + "flos": 22783748904000.0, + "grad_norm": 3.201284302954759, + "language_loss": 0.78071582, + "learning_rate": 3.549833136812155e-06, + "loss": 0.81295884, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.801250696182251 + }, + { + "auxiliary_loss_clip": 0.01702463, + "auxiliary_loss_mlp": 0.01522173, + "balance_loss_clip": 1.34801936, + "balance_loss_mlp": 1.14737868, + "epoch": 0.014970689914324365, + "flos": 26867173341600.0, + "grad_norm": 2.3206241030290133, + "language_loss": 0.84016007, + "learning_rate": 3.552424094769381e-06, + "loss": 0.87240642, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.8275837898254395 + }, + { + "auxiliary_loss_clip": 0.01708075, + "auxiliary_loss_mlp": 0.01544023, + "balance_loss_clip": 1.35409355, + "balance_loss_mlp": 1.17819357, + "epoch": 0.015030813166992334, + "flos": 13987096738560.0, + "grad_norm": 2.1594646866011966, + "language_loss": 0.93336535, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.96588635, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.834414005279541 + }, + { + "auxiliary_loss_clip": 0.01697428, + "auxiliary_loss_mlp": 0.01524858, + "balance_loss_clip": 1.34236968, + "balance_loss_mlp": 1.15292478, + "epoch": 0.015090936419660304, + "flos": 24720152493600.0, + "grad_norm": 2.7257519438345272, + "language_loss": 0.96847618, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.00069904, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.8050317764282227 + }, + { + "auxiliary_loss_clip": 0.01705829, + "auxiliary_loss_mlp": 0.01529954, + "balance_loss_clip": 1.35228789, + "balance_loss_mlp": 1.15744865, + "epoch": 0.015151059672328273, + "flos": 25741150833600.0, + "grad_norm": 6.276606435776289, + "language_loss": 0.84452081, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.87687862, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 2.8612284660339355 + }, + { + "auxiliary_loss_clip": 0.0169998, + "auxiliary_loss_mlp": 0.01527221, + "balance_loss_clip": 1.34563541, + "balance_loss_mlp": 1.15204585, + "epoch": 0.015211182924996243, + "flos": 21873350171520.0, + "grad_norm": 3.7602740864211626, + "language_loss": 0.9814496, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.01372159, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.897636890411377 + }, + { + "auxiliary_loss_clip": 0.01837951, + "auxiliary_loss_mlp": 0.01502777, + "balance_loss_clip": 1.47854328, + "balance_loss_mlp": 1.10604858, + "epoch": 0.015271306177664212, + "flos": 66901807504800.0, + "grad_norm": 0.8887050880791969, + "language_loss": 0.55595851, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.58936578, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.3541040420532227 + }, + { + "auxiliary_loss_clip": 0.01693475, + "auxiliary_loss_mlp": 0.015334, + "balance_loss_clip": 1.33919001, + "balance_loss_mlp": 1.16451883, + "epoch": 0.01533142943033218, + "flos": 26836527024000.0, + "grad_norm": 2.423730119278883, + "language_loss": 0.90070426, + "learning_rate": 3.567754632921479e-06, + "loss": 0.93297297, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 2.996474027633667 + }, + { + "auxiliary_loss_clip": 0.01692761, + "auxiliary_loss_mlp": 0.01541812, + "balance_loss_clip": 1.3379662, + "balance_loss_mlp": 1.18628192, + "epoch": 0.01539155268300015, + "flos": 20815864577280.0, + "grad_norm": 2.353866593693959, + "language_loss": 0.85563087, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.88797653, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 2.785595417022705 + }, + { + "auxiliary_loss_clip": 0.01694046, + "auxiliary_loss_mlp": 0.01539609, + "balance_loss_clip": 1.34014988, + "balance_loss_mlp": 1.18198168, + "epoch": 0.01545167593566812, + "flos": 15963742470240.0, + "grad_norm": 2.6990648928175784, + "language_loss": 0.71113992, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.74347639, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 2.818587064743042 + }, + { + "auxiliary_loss_clip": 0.01698794, + "auxiliary_loss_mlp": 0.01557914, + "balance_loss_clip": 1.34604573, + "balance_loss_mlp": 1.20143104, + "epoch": 0.01551179918833609, + "flos": 22603967604000.0, + "grad_norm": 2.302298522865782, + "language_loss": 0.94700915, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97957623, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 2.82804012298584 + }, + { + "auxiliary_loss_clip": 0.01695329, + "auxiliary_loss_mlp": 0.01547267, + "balance_loss_clip": 1.34283113, + "balance_loss_mlp": 1.18181896, + "epoch": 0.015571922441004058, + "flos": 22818719031840.0, + "grad_norm": 2.5130064418457962, + "language_loss": 0.93086171, + "learning_rate": 3.577775880881658e-06, + "loss": 0.96328771, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.85992693901062 + }, + { + "auxiliary_loss_clip": 0.01705957, + "auxiliary_loss_mlp": 0.01531938, + "balance_loss_clip": 1.35169744, + "balance_loss_mlp": 1.16935062, + "epoch": 0.015632045693672027, + "flos": 18949135746240.0, + "grad_norm": 1.9713639044383962, + "language_loss": 0.97173953, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.00411844, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 2.759908676147461 + }, + { + "auxiliary_loss_clip": 0.01687475, + "auxiliary_loss_mlp": 0.0153202, + "balance_loss_clip": 1.33316207, + "balance_loss_mlp": 1.17820692, + "epoch": 0.015692168946339995, + "flos": 29974355032320.0, + "grad_norm": 3.6620240079803565, + "language_loss": 0.88052511, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.91272008, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.884453058242798 + }, + { + "auxiliary_loss_clip": 0.01691413, + "auxiliary_loss_mlp": 0.01561595, + "balance_loss_clip": 1.33825648, + "balance_loss_mlp": 1.20854509, + "epoch": 0.015752292199007967, + "flos": 19394189147520.0, + "grad_norm": 2.0534348343754156, + "language_loss": 0.67447591, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.70700598, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 2.815612554550171 + }, + { + "auxiliary_loss_clip": 0.01683461, + "auxiliary_loss_mlp": 0.01588587, + "balance_loss_clip": 1.32889771, + "balance_loss_mlp": 1.23515558, + "epoch": 0.015812415451675936, + "flos": 20341909553760.0, + "grad_norm": 2.812863731041719, + "language_loss": 0.68293595, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71565652, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 2.7782416343688965 + }, + { + "auxiliary_loss_clip": 0.01694509, + "auxiliary_loss_mlp": 0.01559213, + "balance_loss_clip": 1.33976007, + "balance_loss_mlp": 1.20482802, + "epoch": 0.015872538704343905, + "flos": 17527081034880.0, + "grad_norm": 3.578376673728216, + "language_loss": 0.8539086, + "learning_rate": 3.590087005168037e-06, + "loss": 0.88644582, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.802534818649292 + }, + { + "auxiliary_loss_clip": 0.01691259, + "auxiliary_loss_mlp": 0.01558014, + "balance_loss_clip": 1.33703804, + "balance_loss_mlp": 1.19428277, + "epoch": 0.015932661957011873, + "flos": 15261874947360.0, + "grad_norm": 2.862116234192427, + "language_loss": 1.04201388, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.07450652, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.902754068374634 + }, + { + "auxiliary_loss_clip": 0.01699978, + "auxiliary_loss_mlp": 0.0157068, + "balance_loss_clip": 1.34795046, + "balance_loss_mlp": 1.21267104, + "epoch": 0.015992785209679845, + "flos": 20304587880000.0, + "grad_norm": 3.4598212951304297, + "language_loss": 0.75299299, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.78569955, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.8600358963012695 + }, + { + "auxiliary_loss_clip": 0.0169109, + "auxiliary_loss_mlp": 0.01538521, + "balance_loss_clip": 1.3376286, + "balance_loss_mlp": 1.17803216, + "epoch": 0.016052908462347814, + "flos": 23364321078240.0, + "grad_norm": 2.2224552364742878, + "language_loss": 0.90561724, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93791342, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.7614662647247314 + }, + { + "auxiliary_loss_clip": 0.01686402, + "auxiliary_loss_mlp": 0.01580435, + "balance_loss_clip": 1.33313084, + "balance_loss_mlp": 1.21727562, + "epoch": 0.016113031715015783, + "flos": 21288643827840.0, + "grad_norm": 3.0739080901830333, + "language_loss": 0.85915017, + "learning_rate": 3.599769175344462e-06, + "loss": 0.89181852, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 2.771570920944214 + }, + { + "auxiliary_loss_clip": 0.01701405, + "auxiliary_loss_mlp": 0.01574099, + "balance_loss_clip": 1.34850883, + "balance_loss_mlp": 1.20388258, + "epoch": 0.01617315496768375, + "flos": 18916593020640.0, + "grad_norm": 2.2420465157034295, + "language_loss": 0.88458395, + "learning_rate": 3.602167137831432e-06, + "loss": 0.91733897, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 2.953961133956909 + }, + { + "auxiliary_loss_clip": 0.01677906, + "auxiliary_loss_mlp": 0.01558725, + "balance_loss_clip": 1.32337427, + "balance_loss_mlp": 1.19804513, + "epoch": 0.01623327822035172, + "flos": 16548448813920.0, + "grad_norm": 4.001503979448871, + "language_loss": 0.97115433, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.00352061, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.935643434524536 + }, + { + "auxiliary_loss_clip": 0.01697185, + "auxiliary_loss_mlp": 0.01544312, + "balance_loss_clip": 1.34463286, + "balance_loss_mlp": 1.16818309, + "epoch": 0.016293401473019692, + "flos": 23515731750240.0, + "grad_norm": 3.194543148698401, + "language_loss": 0.85855675, + "learning_rate": 3.606936435072361e-06, + "loss": 0.89097172, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 2.895630121231079 + }, + { + "auxiliary_loss_clip": 0.01685877, + "auxiliary_loss_mlp": 0.01551278, + "balance_loss_clip": 1.33129048, + "balance_loss_mlp": 1.18602049, + "epoch": 0.01635352472568766, + "flos": 29018252502720.0, + "grad_norm": 3.0633596529092295, + "language_loss": 0.81096154, + "learning_rate": 3.609307900676025e-06, + "loss": 0.84333313, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.9611899852752686 + }, + { + "auxiliary_loss_clip": 0.01688073, + "auxiliary_loss_mlp": 0.01515555, + "balance_loss_clip": 1.33426499, + "balance_loss_mlp": 1.13885355, + "epoch": 0.01641364797835563, + "flos": 13372199215200.0, + "grad_norm": 4.1275055773825216, + "language_loss": 0.81307542, + "learning_rate": 3.611670663634051e-06, + "loss": 0.84511173, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.8927292823791504 + }, + { + "auxiliary_loss_clip": 0.01685209, + "auxiliary_loss_mlp": 0.01546791, + "balance_loss_clip": 1.33018219, + "balance_loss_mlp": 1.17256927, + "epoch": 0.016473771231023598, + "flos": 18880143694560.0, + "grad_norm": 2.958477749315801, + "language_loss": 0.91533542, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94765538, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.9690566062927246 + }, + { + "auxiliary_loss_clip": 0.01680648, + "auxiliary_loss_mlp": 0.01532952, + "balance_loss_clip": 1.32747769, + "balance_loss_mlp": 1.1642611, + "epoch": 0.016533894483691566, + "flos": 22603853819520.0, + "grad_norm": 2.0386759338629648, + "language_loss": 0.88191932, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.91405535, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.8767857551574707 + }, + { + "auxiliary_loss_clip": 0.01681189, + "auxiliary_loss_mlp": 0.0153348, + "balance_loss_clip": 1.32721019, + "balance_loss_mlp": 1.16688752, + "epoch": 0.01659401773635954, + "flos": 21509349976800.0, + "grad_norm": 1.9084582202194027, + "language_loss": 0.8086738, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.84082055, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 2.9738059043884277 + }, + { + "auxiliary_loss_clip": 0.01689028, + "auxiliary_loss_mlp": 0.01534831, + "balance_loss_clip": 1.33428431, + "balance_loss_mlp": 1.1628983, + "epoch": 0.016654140989027507, + "flos": 32853586295520.0, + "grad_norm": 2.424344533648599, + "language_loss": 0.81130803, + "learning_rate": 3.621035951423551e-06, + "loss": 0.84354663, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.9642205238342285 + }, + { + "auxiliary_loss_clip": 0.01680964, + "auxiliary_loss_mlp": 0.01549871, + "balance_loss_clip": 1.32745838, + "balance_loss_mlp": 1.1796546, + "epoch": 0.016714264241695476, + "flos": 12307507270560.0, + "grad_norm": 3.638048574851431, + "language_loss": 0.8041153, + "learning_rate": 3.623356141983041e-06, + "loss": 0.83642364, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 3.0096347332000732 + }, + { + "auxiliary_loss_clip": 0.01688838, + "auxiliary_loss_mlp": 0.01508368, + "balance_loss_clip": 1.33451235, + "balance_loss_mlp": 1.13281131, + "epoch": 0.016774387494363444, + "flos": 27126149368320.0, + "grad_norm": 2.3461435783765783, + "language_loss": 0.90540552, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.93737757, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.8766064643859863 + }, + { + "auxiliary_loss_clip": 0.01680497, + "auxiliary_loss_mlp": 0.01536645, + "balance_loss_clip": 1.32588959, + "balance_loss_mlp": 1.1568923, + "epoch": 0.016834510747031413, + "flos": 20193457278240.0, + "grad_norm": 5.97677896753103, + "language_loss": 0.94105864, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.97323006, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.9562277793884277 + }, + { + "auxiliary_loss_clip": 0.01673314, + "auxiliary_loss_mlp": 0.01509957, + "balance_loss_clip": 1.31842494, + "balance_loss_mlp": 1.14317369, + "epoch": 0.016894633999699385, + "flos": 27276877333440.0, + "grad_norm": 2.2609930799549898, + "language_loss": 0.7421211, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.77395374, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.9312903881073 + }, + { + "auxiliary_loss_clip": 0.01686118, + "auxiliary_loss_mlp": 0.01561521, + "balance_loss_clip": 1.32931852, + "balance_loss_mlp": 1.18768096, + "epoch": 0.016954757252367354, + "flos": 14904739749600.0, + "grad_norm": 2.7059437306441034, + "language_loss": 0.7998482, + "learning_rate": 3.632554186750274e-06, + "loss": 0.83232462, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 7.513384819030762 + }, + { + "auxiliary_loss_clip": 0.01681469, + "auxiliary_loss_mlp": 0.0153502, + "balance_loss_clip": 1.3262558, + "balance_loss_mlp": 1.1642313, + "epoch": 0.017014880505035322, + "flos": 21360821844960.0, + "grad_norm": 3.6731982652481743, + "language_loss": 0.77819848, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.81036341, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 4.307174444198608 + }, + { + "auxiliary_loss_clip": 0.01679805, + "auxiliary_loss_mlp": 0.0151748, + "balance_loss_clip": 1.32464409, + "balance_loss_mlp": 1.15012455, + "epoch": 0.01707500375770329, + "flos": 35335971213120.0, + "grad_norm": 2.8064524192435374, + "language_loss": 0.84411293, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.87608576, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 2.851940393447876 + }, + { + "auxiliary_loss_clip": 0.01674232, + "auxiliary_loss_mlp": 0.01529494, + "balance_loss_clip": 1.3173039, + "balance_loss_mlp": 1.1546998, + "epoch": 0.01713512701037126, + "flos": 23584079023200.0, + "grad_norm": 3.7662353891409213, + "language_loss": 0.96927428, + "learning_rate": 3.639367500948819e-06, + "loss": 1.00131166, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 2.808007001876831 + }, + { + "auxiliary_loss_clip": 0.01676262, + "auxiliary_loss_mlp": 0.0150033, + "balance_loss_clip": 1.31907725, + "balance_loss_mlp": 1.11962366, + "epoch": 0.01719525026303923, + "flos": 27637046784000.0, + "grad_norm": 2.857322350637043, + "language_loss": 0.9369005, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96866637, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.867645740509033 + }, + { + "auxiliary_loss_clip": 0.01676783, + "auxiliary_loss_mlp": 0.01530603, + "balance_loss_clip": 1.32055557, + "balance_loss_mlp": 1.1590513, + "epoch": 0.0172553735157072, + "flos": 26982172615680.0, + "grad_norm": 1.7972481410253192, + "language_loss": 0.9218117, + "learning_rate": 3.643869982119001e-06, + "loss": 0.9538855, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.8519022464752197 + }, + { + "auxiliary_loss_clip": 0.01675541, + "auxiliary_loss_mlp": 0.01522328, + "balance_loss_clip": 1.31706166, + "balance_loss_mlp": 1.14886916, + "epoch": 0.01731549676837517, + "flos": 14057947270080.0, + "grad_norm": 3.0359648110554565, + "language_loss": 1.01869011, + "learning_rate": 3.646109470232502e-06, + "loss": 1.05066884, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.740353584289551 + }, + { + "auxiliary_loss_clip": 0.01784454, + "auxiliary_loss_mlp": 0.01617798, + "balance_loss_clip": 1.4182992, + "balance_loss_mlp": 1.32254028, + "epoch": 0.017375620021043137, + "flos": 66518349799680.0, + "grad_norm": 1.0105307437482658, + "language_loss": 0.63853657, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.67255908, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.437382221221924 + }, + { + "auxiliary_loss_clip": 0.01676918, + "auxiliary_loss_mlp": 0.01519611, + "balance_loss_clip": 1.31979704, + "balance_loss_mlp": 1.1394763, + "epoch": 0.01743574327371111, + "flos": 15226601394240.0, + "grad_norm": 3.5416426658609543, + "language_loss": 0.88719785, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.91916311, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 2.775041341781616 + }, + { + "auxiliary_loss_clip": 0.01673034, + "auxiliary_loss_mlp": 0.01516351, + "balance_loss_clip": 1.31540275, + "balance_loss_mlp": 1.13068557, + "epoch": 0.017495866526379078, + "flos": 25376202434880.0, + "grad_norm": 2.156708159970782, + "language_loss": 0.84805536, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87994921, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.816286325454712 + }, + { + "auxiliary_loss_clip": 0.01689791, + "auxiliary_loss_mlp": 0.01535775, + "balance_loss_clip": 1.33261144, + "balance_loss_mlp": 1.14896488, + "epoch": 0.017555989779047047, + "flos": 26361889293600.0, + "grad_norm": 1.6720964947924126, + "language_loss": 0.72708035, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75933599, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.8503129482269287 + }, + { + "auxiliary_loss_clip": 0.01685147, + "auxiliary_loss_mlp": 0.01540696, + "balance_loss_clip": 1.32730722, + "balance_loss_mlp": 1.14682841, + "epoch": 0.017616113031715015, + "flos": 22340591910720.0, + "grad_norm": 2.5932205282218663, + "language_loss": 0.87378931, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.9060477, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.816728353500366 + }, + { + "auxiliary_loss_clip": 0.01691938, + "auxiliary_loss_mlp": 0.01545702, + "balance_loss_clip": 1.33389473, + "balance_loss_mlp": 1.15622115, + "epoch": 0.017676236284382984, + "flos": 20158980216480.0, + "grad_norm": 2.6222979362199648, + "language_loss": 0.81134832, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.84372473, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 2.763911485671997 + }, + { + "auxiliary_loss_clip": 0.01673511, + "auxiliary_loss_mlp": 0.01515895, + "balance_loss_clip": 1.31530404, + "balance_loss_mlp": 1.12488866, + "epoch": 0.017736359537050956, + "flos": 25225398613440.0, + "grad_norm": 1.9090894513805776, + "language_loss": 0.83813691, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.87003094, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 2.830528497695923 + }, + { + "auxiliary_loss_clip": 0.0168585, + "auxiliary_loss_mlp": 0.0157155, + "balance_loss_clip": 1.32780862, + "balance_loss_mlp": 1.18054342, + "epoch": 0.017796482789718925, + "flos": 20340847565280.0, + "grad_norm": 2.3997832257768934, + "language_loss": 0.84610474, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87867868, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 2.7751951217651367 + }, + { + "auxiliary_loss_clip": 0.01677777, + "auxiliary_loss_mlp": 0.01536973, + "balance_loss_clip": 1.32028103, + "balance_loss_mlp": 1.14539385, + "epoch": 0.017856606042386893, + "flos": 22381213334400.0, + "grad_norm": 2.041140450094627, + "language_loss": 0.87547731, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90762484, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 2.7792744636535645 + }, + { + "auxiliary_loss_clip": 0.01682836, + "auxiliary_loss_mlp": 0.01505178, + "balance_loss_clip": 1.32405293, + "balance_loss_mlp": 1.11398077, + "epoch": 0.017916729295054862, + "flos": 20232144365760.0, + "grad_norm": 2.273950277968618, + "language_loss": 0.88515842, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.91703862, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 2.896641731262207 + }, + { + "auxiliary_loss_clip": 0.01673573, + "auxiliary_loss_mlp": 0.01514223, + "balance_loss_clip": 1.31580412, + "balance_loss_mlp": 1.12226284, + "epoch": 0.01797685254772283, + "flos": 19393013374560.0, + "grad_norm": 1.9933276937280702, + "language_loss": 0.88724256, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91912055, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 2.7768776416778564 + }, + { + "auxiliary_loss_clip": 0.01672069, + "auxiliary_loss_mlp": 0.0151638, + "balance_loss_clip": 1.31328952, + "balance_loss_mlp": 1.12384772, + "epoch": 0.018036975800390802, + "flos": 24428330316000.0, + "grad_norm": 3.033457277795304, + "language_loss": 0.6509524, + "learning_rate": 3.672392800539357e-06, + "loss": 0.68283689, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.9143149852752686 + }, + { + "auxiliary_loss_clip": 0.01678367, + "auxiliary_loss_mlp": 0.01519835, + "balance_loss_clip": 1.32028651, + "balance_loss_mlp": 1.13855624, + "epoch": 0.01809709905305877, + "flos": 15780775204800.0, + "grad_norm": 2.423423344616232, + "language_loss": 0.88466859, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.91665053, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.810411214828491 + }, + { + "auxiliary_loss_clip": 0.01803113, + "auxiliary_loss_mlp": 0.01572735, + "balance_loss_clip": 1.43232417, + "balance_loss_mlp": 1.13175583, + "epoch": 0.01815722230572674, + "flos": 67356001592640.0, + "grad_norm": 0.8455310040408494, + "language_loss": 0.62193966, + "learning_rate": 3.676670903877158e-06, + "loss": 0.65569818, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.5265212059020996 + }, + { + "auxiliary_loss_clip": 0.01666096, + "auxiliary_loss_mlp": 0.01518266, + "balance_loss_clip": 1.30656469, + "balance_loss_mlp": 1.13755918, + "epoch": 0.01821734555839471, + "flos": 15487360044480.0, + "grad_norm": 2.4454892215783257, + "language_loss": 0.89870471, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.93054831, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 2.7488744258880615 + }, + { + "auxiliary_loss_clip": 0.0168545, + "auxiliary_loss_mlp": 0.01546413, + "balance_loss_clip": 1.32729554, + "balance_loss_mlp": 1.17009282, + "epoch": 0.018277468811062677, + "flos": 24099717458880.0, + "grad_norm": 2.1949592392655815, + "language_loss": 0.80253816, + "learning_rate": 3.680920768703364e-06, + "loss": 0.83485681, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.8658978939056396 + }, + { + "auxiliary_loss_clip": 0.0168094, + "auxiliary_loss_mlp": 0.01543776, + "balance_loss_clip": 1.32279515, + "balance_loss_mlp": 1.17088938, + "epoch": 0.01833759206373065, + "flos": 20961434312640.0, + "grad_norm": 1.9307314110017482, + "language_loss": 0.8282578, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.86050498, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 2.828540563583374 + }, + { + "auxiliary_loss_clip": 0.01671885, + "auxiliary_loss_mlp": 0.01538593, + "balance_loss_clip": 1.31219745, + "balance_loss_mlp": 1.15922117, + "epoch": 0.018397715316398618, + "flos": 19392785805600.0, + "grad_norm": 1.8883826731671964, + "language_loss": 0.90987813, + "learning_rate": 3.685142765363119e-06, + "loss": 0.94198298, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.759188175201416 + }, + { + "auxiliary_loss_clip": 0.01669936, + "auxiliary_loss_mlp": 0.0153866, + "balance_loss_clip": 1.30885839, + "balance_loss_mlp": 1.17321157, + "epoch": 0.018457838569066586, + "flos": 29135110256640.0, + "grad_norm": 1.9806643100579042, + "language_loss": 0.86644292, + "learning_rate": 3.687243426879095e-06, + "loss": 0.89852881, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.9069161415100098 + }, + { + "auxiliary_loss_clip": 0.01678026, + "auxiliary_loss_mlp": 0.01602796, + "balance_loss_clip": 1.3201437, + "balance_loss_mlp": 1.24078107, + "epoch": 0.018517961821734555, + "flos": 19210690887840.0, + "grad_norm": 2.1582867540726665, + "language_loss": 0.71671474, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74952292, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.7401351928710938 + }, + { + "auxiliary_loss_clip": 0.01660762, + "auxiliary_loss_mlp": 0.01529806, + "balance_loss_clip": 1.29887986, + "balance_loss_mlp": 1.1628319, + "epoch": 0.018578085074402523, + "flos": 19864958205600.0, + "grad_norm": 2.1846912214370406, + "language_loss": 0.919595, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.95150077, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.885491132736206 + }, + { + "auxiliary_loss_clip": 0.01668582, + "auxiliary_loss_mlp": 0.01542412, + "balance_loss_clip": 1.30872154, + "balance_loss_mlp": 1.18135059, + "epoch": 0.018638208327070496, + "flos": 29609558346240.0, + "grad_norm": 2.196494677170974, + "language_loss": 0.72676003, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75887001, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.8483104705810547 + }, + { + "auxiliary_loss_clip": 0.0167787, + "auxiliary_loss_mlp": 0.01541596, + "balance_loss_clip": 1.31786323, + "balance_loss_mlp": 1.17652953, + "epoch": 0.018698331579738464, + "flos": 45734952456000.0, + "grad_norm": 7.584723922561298, + "language_loss": 0.74179012, + "learning_rate": 3.695578199367497e-06, + "loss": 0.77398479, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.9565281867980957 + }, + { + "auxiliary_loss_clip": 0.01674124, + "auxiliary_loss_mlp": 0.01506224, + "balance_loss_clip": 1.31489992, + "balance_loss_mlp": 1.13772368, + "epoch": 0.018758454832406433, + "flos": 20485658737440.0, + "grad_norm": 2.5189410735080444, + "language_loss": 0.91607201, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.9478755, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.787569761276245 + }, + { + "auxiliary_loss_clip": 0.01671786, + "auxiliary_loss_mlp": 0.01542693, + "balance_loss_clip": 1.31270289, + "balance_loss_mlp": 1.18220389, + "epoch": 0.0188185780850744, + "flos": 15779561503680.0, + "grad_norm": 4.554407654225719, + "language_loss": 0.89906907, + "learning_rate": 3.699705471087043e-06, + "loss": 0.93121392, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.749643087387085 + }, + { + "auxiliary_loss_clip": 0.01673957, + "auxiliary_loss_mlp": 0.01536073, + "balance_loss_clip": 1.31443226, + "balance_loss_mlp": 1.17787325, + "epoch": 0.018878701337742373, + "flos": 22457942730720.0, + "grad_norm": 2.862574908659048, + "language_loss": 0.73449332, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.76659364, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.8539602756500244 + }, + { + "auxiliary_loss_clip": 0.016685, + "auxiliary_loss_mlp": 0.01557329, + "balance_loss_clip": 1.30761194, + "balance_loss_mlp": 1.20008254, + "epoch": 0.018938824590410342, + "flos": 30996225720000.0, + "grad_norm": 3.614430169825856, + "language_loss": 0.89869416, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.93095243, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.9068667888641357 + }, + { + "auxiliary_loss_clip": 0.01667813, + "auxiliary_loss_mlp": 0.01557193, + "balance_loss_clip": 1.30813515, + "balance_loss_mlp": 1.2001375, + "epoch": 0.01899894784307831, + "flos": 23261193318240.0, + "grad_norm": 1.9878046793833055, + "language_loss": 0.80782765, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.8400777, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 2.8197529315948486 + }, + { + "auxiliary_loss_clip": 0.016667, + "auxiliary_loss_mlp": 0.01563719, + "balance_loss_clip": 1.30787826, + "balance_loss_mlp": 1.21066916, + "epoch": 0.01905907109574628, + "flos": 17459871606720.0, + "grad_norm": 3.610172614029438, + "language_loss": 0.90014768, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.93245184, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.765554904937744 + }, + { + "auxiliary_loss_clip": 0.01666792, + "auxiliary_loss_mlp": 0.01568636, + "balance_loss_clip": 1.30719543, + "balance_loss_mlp": 1.21177089, + "epoch": 0.019119194348414248, + "flos": 14971683680640.0, + "grad_norm": 3.5791127816037696, + "language_loss": 0.90622067, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93857497, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.799828052520752 + }, + { + "auxiliary_loss_clip": 0.01670513, + "auxiliary_loss_mlp": 0.01544215, + "balance_loss_clip": 1.30932856, + "balance_loss_mlp": 1.1848706, + "epoch": 0.01917931760108222, + "flos": 25485133203360.0, + "grad_norm": 3.3434681757500897, + "language_loss": 0.93879992, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.97094715, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 4.397897005081177 + }, + { + "auxiliary_loss_clip": 0.0179214, + "auxiliary_loss_mlp": 0.01458435, + "balance_loss_clip": 1.41696465, + "balance_loss_mlp": 1.06170654, + "epoch": 0.01923944085375019, + "flos": 71563490498880.0, + "grad_norm": 0.938747091802421, + "language_loss": 0.59836018, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.63086593, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 6.335074186325073 + }, + { + "auxiliary_loss_clip": 0.01663114, + "auxiliary_loss_mlp": 0.01542464, + "balance_loss_clip": 1.30361962, + "balance_loss_mlp": 1.17911398, + "epoch": 0.019299564106418157, + "flos": 19684532126880.0, + "grad_norm": 3.3672363213780327, + "language_loss": 0.90380633, + "learning_rate": 3.715954969092154e-06, + "loss": 0.93586218, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 4.416521787643433 + }, + { + "auxiliary_loss_clip": 0.01683677, + "auxiliary_loss_mlp": 0.01560813, + "balance_loss_clip": 1.32232285, + "balance_loss_mlp": 1.18983328, + "epoch": 0.019359687359086126, + "flos": 24389339803200.0, + "grad_norm": 4.099544271147796, + "language_loss": 0.82884121, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.86128604, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 2.857023239135742 + }, + { + "auxiliary_loss_clip": 0.01670123, + "auxiliary_loss_mlp": 0.01535292, + "balance_loss_clip": 1.31008637, + "balance_loss_mlp": 1.16755545, + "epoch": 0.019419810611754094, + "flos": 23953654657440.0, + "grad_norm": 2.1575744250768336, + "language_loss": 0.72717237, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75922656, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 2.8061163425445557 + }, + { + "auxiliary_loss_clip": 0.01661664, + "auxiliary_loss_mlp": 0.01547762, + "balance_loss_clip": 1.30059028, + "balance_loss_mlp": 1.19070625, + "epoch": 0.019479933864422067, + "flos": 22162213952640.0, + "grad_norm": 2.680112808479712, + "language_loss": 0.92373252, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95582676, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.833314895629883 + }, + { + "auxiliary_loss_clip": 0.01673304, + "auxiliary_loss_mlp": 0.01556721, + "balance_loss_clip": 1.31260026, + "balance_loss_mlp": 1.19813919, + "epoch": 0.019540057117090035, + "flos": 22239019205280.0, + "grad_norm": 4.9635178080417575, + "language_loss": 0.65534192, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.68764222, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.7981584072113037 + }, + { + "auxiliary_loss_clip": 0.0167349, + "auxiliary_loss_mlp": 0.01517606, + "balance_loss_clip": 1.31138563, + "balance_loss_mlp": 1.14967811, + "epoch": 0.019600180369758004, + "flos": 23079060472320.0, + "grad_norm": 2.2588050785375966, + "language_loss": 0.76765978, + "learning_rate": 3.72590651470665e-06, + "loss": 0.7995708, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.820613145828247 + }, + { + "auxiliary_loss_clip": 0.01670732, + "auxiliary_loss_mlp": 0.01520428, + "balance_loss_clip": 1.3127743, + "balance_loss_mlp": 1.15688777, + "epoch": 0.019660303622425972, + "flos": 25413220683360.0, + "grad_norm": 6.127910067596255, + "language_loss": 0.79806459, + "learning_rate": 3.727878498433505e-06, + "loss": 0.8299762, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.8370063304901123 + }, + { + "auxiliary_loss_clip": 0.0168792, + "auxiliary_loss_mlp": 0.01564845, + "balance_loss_clip": 1.32764375, + "balance_loss_mlp": 1.20378375, + "epoch": 0.01972042687509394, + "flos": 23659670574720.0, + "grad_norm": 2.238319182563457, + "language_loss": 0.80969208, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.84221977, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.8214173316955566 + }, + { + "auxiliary_loss_clip": 0.01664767, + "auxiliary_loss_mlp": 0.01533799, + "balance_loss_clip": 1.30379462, + "balance_loss_mlp": 1.1597681, + "epoch": 0.019780550127761913, + "flos": 18225459167040.0, + "grad_norm": 3.4787087777175896, + "language_loss": 0.93710554, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96909124, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.8976056575775146 + }, + { + "auxiliary_loss_clip": 0.01665897, + "auxiliary_loss_mlp": 0.01536237, + "balance_loss_clip": 1.30667591, + "balance_loss_mlp": 1.17250514, + "epoch": 0.01984067338042988, + "flos": 22420962410400.0, + "grad_norm": 3.8183278030369374, + "language_loss": 0.74794948, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.77997082, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 2.830643653869629 + }, + { + "auxiliary_loss_clip": 0.01668717, + "auxiliary_loss_mlp": 0.01516134, + "balance_loss_clip": 1.30941534, + "balance_loss_mlp": 1.13828814, + "epoch": 0.01990079663309785, + "flos": 17057070540000.0, + "grad_norm": 2.8080035078433796, + "language_loss": 0.94022298, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.97207153, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 2.8130505084991455 + }, + { + "auxiliary_loss_clip": 0.01678062, + "auxiliary_loss_mlp": 0.01513074, + "balance_loss_clip": 1.31714892, + "balance_loss_mlp": 1.13675404, + "epoch": 0.01996091988576582, + "flos": 15963666613920.0, + "grad_norm": 2.3545941220536117, + "language_loss": 0.92734218, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95925361, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.76668643951416 + }, + { + "auxiliary_loss_clip": 0.01663452, + "auxiliary_loss_mlp": 0.01526031, + "balance_loss_clip": 1.30274773, + "balance_loss_mlp": 1.15428925, + "epoch": 0.02002104313843379, + "flos": 23588706258720.0, + "grad_norm": 3.225548507525634, + "language_loss": 0.75480801, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78670287, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 2.850853443145752 + }, + { + "auxiliary_loss_clip": 0.01669647, + "auxiliary_loss_mlp": 0.01516107, + "balance_loss_clip": 1.30864954, + "balance_loss_mlp": 1.14322054, + "epoch": 0.02008116639110176, + "flos": 34097756114880.0, + "grad_norm": 2.3519158931180133, + "language_loss": 0.79055518, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.82241273, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.874162197113037 + }, + { + "auxiliary_loss_clip": 0.01665278, + "auxiliary_loss_mlp": 0.01528309, + "balance_loss_clip": 1.30465126, + "balance_loss_mlp": 1.15027225, + "epoch": 0.020141289643769728, + "flos": 19685821684320.0, + "grad_norm": 2.1859551894531246, + "language_loss": 0.83082914, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.86276501, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.7858188152313232 + }, + { + "auxiliary_loss_clip": 0.01663005, + "auxiliary_loss_mlp": 0.01514409, + "balance_loss_clip": 1.30101109, + "balance_loss_mlp": 1.14285791, + "epoch": 0.020201412896437697, + "flos": 20742662499840.0, + "grad_norm": 2.8082569010059406, + "language_loss": 0.92442828, + "learning_rate": 3.745359722027911e-06, + "loss": 0.95620239, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 2.8005473613739014 + }, + { + "auxiliary_loss_clip": 0.01661642, + "auxiliary_loss_mlp": 0.01513796, + "balance_loss_clip": 1.3011775, + "balance_loss_mlp": 1.14186287, + "epoch": 0.020261536149105665, + "flos": 20268593691840.0, + "grad_norm": 1.7329821554077753, + "language_loss": 0.88487697, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.91663134, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 2.7849841117858887 + }, + { + "auxiliary_loss_clip": 0.01665511, + "auxiliary_loss_mlp": 0.0151626, + "balance_loss_clip": 1.30407906, + "balance_loss_mlp": 1.14222848, + "epoch": 0.020321659401773638, + "flos": 25851143590560.0, + "grad_norm": 1.6898621640333773, + "language_loss": 0.89948356, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.9313013, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 2.8918988704681396 + }, + { + "auxiliary_loss_clip": 0.01662719, + "auxiliary_loss_mlp": 0.01504559, + "balance_loss_clip": 1.30021286, + "balance_loss_mlp": 1.12690353, + "epoch": 0.020381782654441606, + "flos": 17497041567840.0, + "grad_norm": 2.3763777441732605, + "language_loss": 0.85108125, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.88275403, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.8077492713928223 + }, + { + "auxiliary_loss_clip": 0.01674819, + "auxiliary_loss_mlp": 0.01527085, + "balance_loss_clip": 1.31151199, + "balance_loss_mlp": 1.14733183, + "epoch": 0.020441905907109575, + "flos": 24246273326400.0, + "grad_norm": 1.7741228717626254, + "language_loss": 0.8894102, + "learning_rate": 3.75297936342452e-06, + "loss": 0.92142928, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 2.8508310317993164 + }, + { + "auxiliary_loss_clip": 0.01670862, + "auxiliary_loss_mlp": 0.01509182, + "balance_loss_clip": 1.30793953, + "balance_loss_mlp": 1.12809372, + "epoch": 0.020502029159777543, + "flos": 22235302245600.0, + "grad_norm": 3.2844054456142975, + "language_loss": 0.88342941, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.9152298, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.8164596557617188 + }, + { + "auxiliary_loss_clip": 0.01666212, + "auxiliary_loss_mlp": 0.01500453, + "balance_loss_clip": 1.30214524, + "balance_loss_mlp": 1.12298894, + "epoch": 0.020562152412445512, + "flos": 23990141911680.0, + "grad_norm": 3.344007693290106, + "language_loss": 0.81026661, + "learning_rate": 3.756755633390458e-06, + "loss": 0.84193325, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.7788681983947754 + }, + { + "auxiliary_loss_clip": 0.01667204, + "auxiliary_loss_mlp": 0.01522586, + "balance_loss_clip": 1.30397618, + "balance_loss_mlp": 1.14588428, + "epoch": 0.020622275665113484, + "flos": 26977507452000.0, + "grad_norm": 1.6970537890112505, + "language_loss": 0.89779663, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.92969453, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.8354055881500244 + }, + { + "auxiliary_loss_clip": 0.01671007, + "auxiliary_loss_mlp": 0.01529029, + "balance_loss_clip": 1.30701745, + "balance_loss_mlp": 1.15957546, + "epoch": 0.020682398917781453, + "flos": 22602867687360.0, + "grad_norm": 2.6722633649724328, + "language_loss": 0.78051472, + "learning_rate": 3.7605098841644e-06, + "loss": 0.81251514, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 2.7873382568359375 + }, + { + "auxiliary_loss_clip": 0.01658101, + "auxiliary_loss_mlp": 0.01522821, + "balance_loss_clip": 1.29326022, + "balance_loss_mlp": 1.14974344, + "epoch": 0.02074252217044942, + "flos": 15015453141600.0, + "grad_norm": 2.9665574704940423, + "language_loss": 0.75003225, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.78184146, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 2.776715040206909 + }, + { + "auxiliary_loss_clip": 0.01662178, + "auxiliary_loss_mlp": 0.01542119, + "balance_loss_clip": 1.29936683, + "balance_loss_mlp": 1.16961384, + "epoch": 0.02080264542311739, + "flos": 25340322031200.0, + "grad_norm": 2.1122776216718124, + "language_loss": 0.90337539, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.93541837, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 2.773411512374878 + }, + { + "auxiliary_loss_clip": 0.01661, + "auxiliary_loss_mlp": 0.01501229, + "balance_loss_clip": 1.29739642, + "balance_loss_mlp": 1.1214757, + "epoch": 0.02086276867578536, + "flos": 24391084498560.0, + "grad_norm": 2.1525585596182983, + "language_loss": 0.78784537, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81946766, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 2.8652265071868896 + }, + { + "auxiliary_loss_clip": 0.01670177, + "auxiliary_loss_mlp": 0.01506163, + "balance_loss_clip": 1.30663621, + "balance_loss_mlp": 1.11496615, + "epoch": 0.02092289192845333, + "flos": 24464514144960.0, + "grad_norm": 2.7970709396974427, + "language_loss": 0.7158106, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.74757397, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.806077241897583 + }, + { + "auxiliary_loss_clip": 0.01657706, + "auxiliary_loss_mlp": 0.01505688, + "balance_loss_clip": 1.29242396, + "balance_loss_mlp": 1.12364554, + "epoch": 0.0209830151811213, + "flos": 17452779040800.0, + "grad_norm": 2.7927051838183226, + "language_loss": 0.76834273, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79997665, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.788390636444092 + }, + { + "auxiliary_loss_clip": 0.01669175, + "auxiliary_loss_mlp": 0.01558303, + "balance_loss_clip": 1.30416989, + "balance_loss_mlp": 1.1949532, + "epoch": 0.021043138433789268, + "flos": 24576441238080.0, + "grad_norm": 1.8899220146655675, + "language_loss": 0.85087579, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.88315058, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.7981173992156982 + }, + { + "auxiliary_loss_clip": 0.01670446, + "auxiliary_loss_mlp": 0.01532751, + "balance_loss_clip": 1.30625415, + "balance_loss_mlp": 1.15681207, + "epoch": 0.021103261686457236, + "flos": 24455904452640.0, + "grad_norm": 2.110858754261019, + "language_loss": 0.79888415, + "learning_rate": 3.773480007028776e-06, + "loss": 0.83091611, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.837265968322754 + }, + { + "auxiliary_loss_clip": 0.01655237, + "auxiliary_loss_mlp": 0.0150154, + "balance_loss_clip": 1.29073215, + "balance_loss_mlp": 1.13094223, + "epoch": 0.021163384939125205, + "flos": 14684792163840.0, + "grad_norm": 2.024967319340129, + "language_loss": 0.8734147, + "learning_rate": 3.775311735671078e-06, + "loss": 0.90498245, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.7219574451446533 + }, + { + "auxiliary_loss_clip": 0.01662664, + "auxiliary_loss_mlp": 0.01531725, + "balance_loss_clip": 1.29794383, + "balance_loss_mlp": 1.16246223, + "epoch": 0.021223508191793177, + "flos": 24495198390720.0, + "grad_norm": 3.9295577695170736, + "language_loss": 0.82471019, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.85665411, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.82306170463562 + }, + { + "auxiliary_loss_clip": 0.01666119, + "auxiliary_loss_mlp": 0.01544417, + "balance_loss_clip": 1.30154657, + "balance_loss_mlp": 1.17477322, + "epoch": 0.021283631444461146, + "flos": 24128884578240.0, + "grad_norm": 2.1966545692258768, + "language_loss": 0.81192935, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.84403467, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.8458476066589355 + }, + { + "auxiliary_loss_clip": 0.01653941, + "auxiliary_loss_mlp": 0.01505545, + "balance_loss_clip": 1.28904986, + "balance_loss_mlp": 1.13456559, + "epoch": 0.021343754697129114, + "flos": 25194600583200.0, + "grad_norm": 2.2870666409934746, + "language_loss": 0.81256413, + "learning_rate": 3.780775860546545e-06, + "loss": 0.84415901, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.7766451835632324 + }, + { + "auxiliary_loss_clip": 0.01648582, + "auxiliary_loss_mlp": 0.01526645, + "balance_loss_clip": 1.28444982, + "balance_loss_mlp": 1.15776408, + "epoch": 0.021403877949797083, + "flos": 17276069921760.0, + "grad_norm": 2.7439223862907673, + "language_loss": 0.89601648, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.92776871, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.774798631668091 + }, + { + "auxiliary_loss_clip": 0.01664656, + "auxiliary_loss_mlp": 0.01517939, + "balance_loss_clip": 1.29900885, + "balance_loss_mlp": 1.1523006, + "epoch": 0.021464001202465055, + "flos": 30919951461600.0, + "grad_norm": 3.4166621595205027, + "language_loss": 0.80453104, + "learning_rate": 3.784393017158528e-06, + "loss": 0.836357, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 4.390174865722656 + }, + { + "auxiliary_loss_clip": 0.01661078, + "auxiliary_loss_mlp": 0.01527729, + "balance_loss_clip": 1.29558003, + "balance_loss_mlp": 1.15980172, + "epoch": 0.021524124455133024, + "flos": 18188327134080.0, + "grad_norm": 2.9292524551034536, + "language_loss": 0.76460701, + "learning_rate": 3.786194003461506e-06, + "loss": 0.79649508, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 5.749543190002441 + }, + { + "auxiliary_loss_clip": 0.01649287, + "auxiliary_loss_mlp": 0.0150062, + "balance_loss_clip": 1.28314996, + "balance_loss_mlp": 1.12754273, + "epoch": 0.021584247707800992, + "flos": 13807353366720.0, + "grad_norm": 4.55557651157651, + "language_loss": 0.88166285, + "learning_rate": 3.787989966086264e-06, + "loss": 0.91316187, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 4.261540174484253 + }, + { + "auxiliary_loss_clip": 0.01669705, + "auxiliary_loss_mlp": 0.01537486, + "balance_loss_clip": 1.30593467, + "balance_loss_mlp": 1.1747086, + "epoch": 0.02164437096046896, + "flos": 23296959937440.0, + "grad_norm": 3.2493479030404435, + "language_loss": 0.76201636, + "learning_rate": 3.789780932980997e-06, + "loss": 0.79408824, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 2.8496994972229004 + }, + { + "auxiliary_loss_clip": 0.01827593, + "auxiliary_loss_mlp": 0.01452599, + "balance_loss_clip": 1.44727468, + "balance_loss_mlp": 1.10927582, + "epoch": 0.02170449421313693, + "flos": 68906292870240.0, + "grad_norm": 0.8635258993041504, + "language_loss": 0.64894372, + "learning_rate": 3.79156693186132e-06, + "loss": 0.68174565, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 3.465531349182129 + }, + { + "auxiliary_loss_clip": 0.01654858, + "auxiliary_loss_mlp": 0.01518208, + "balance_loss_clip": 1.2891736, + "balance_loss_mlp": 1.15237844, + "epoch": 0.0217646174658049, + "flos": 25230898196640.0, + "grad_norm": 4.318098010484871, + "language_loss": 0.78477871, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.81650937, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.8500776290893555 + }, + { + "auxiliary_loss_clip": 0.01654306, + "auxiliary_loss_mlp": 0.01539245, + "balance_loss_clip": 1.29061842, + "balance_loss_mlp": 1.1726526, + "epoch": 0.02182474071847287, + "flos": 22895334643680.0, + "grad_norm": 2.0792735973927665, + "language_loss": 0.92570531, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.95764083, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.8143386840820312 + }, + { + "auxiliary_loss_clip": 0.01662804, + "auxiliary_loss_mlp": 0.01537935, + "balance_loss_clip": 1.29695344, + "balance_loss_mlp": 1.16867185, + "epoch": 0.02188486397114084, + "flos": 23661225629280.0, + "grad_norm": 2.3956629102743348, + "language_loss": 0.89847916, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.93048644, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 2.8888142108917236 + }, + { + "auxiliary_loss_clip": 0.01658799, + "auxiliary_loss_mlp": 0.01540666, + "balance_loss_clip": 1.29306328, + "balance_loss_mlp": 1.16797054, + "epoch": 0.021944987223808807, + "flos": 21545799302880.0, + "grad_norm": 2.2785950990116453, + "language_loss": 0.79393017, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82592487, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 2.810436964035034 + }, + { + "auxiliary_loss_clip": 0.01661545, + "auxiliary_loss_mlp": 0.01535826, + "balance_loss_clip": 1.29624152, + "balance_loss_mlp": 1.16656327, + "epoch": 0.022005110476476776, + "flos": 16072749095040.0, + "grad_norm": 4.890260699237242, + "language_loss": 0.84436715, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.87634087, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.8153350353240967 + }, + { + "auxiliary_loss_clip": 0.01659704, + "auxiliary_loss_mlp": 0.01569717, + "balance_loss_clip": 1.29500449, + "balance_loss_mlp": 1.19835627, + "epoch": 0.022065233729144748, + "flos": 21435844474080.0, + "grad_norm": 2.0221323530527693, + "language_loss": 0.86991584, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.90221006, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 2.7973530292510986 + }, + { + "auxiliary_loss_clip": 0.01648439, + "auxiliary_loss_mlp": 0.01510536, + "balance_loss_clip": 1.28357327, + "balance_loss_mlp": 1.13154602, + "epoch": 0.022125356981812717, + "flos": 21545875159200.0, + "grad_norm": 22.875049537116166, + "language_loss": 0.8454529, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87704265, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 2.8261241912841797 + }, + { + "auxiliary_loss_clip": 0.01655525, + "auxiliary_loss_mlp": 0.01502823, + "balance_loss_clip": 1.29110122, + "balance_loss_mlp": 1.12306976, + "epoch": 0.022185480234480685, + "flos": 20706251101920.0, + "grad_norm": 3.873875480216643, + "language_loss": 0.75426328, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.78584677, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 2.763079881668091 + }, + { + "auxiliary_loss_clip": 0.01655226, + "auxiliary_loss_mlp": 0.01520773, + "balance_loss_clip": 1.28996003, + "balance_loss_mlp": 1.15284514, + "epoch": 0.022245603487148654, + "flos": 25195624643520.0, + "grad_norm": 2.0110374012827044, + "language_loss": 0.83198851, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.86374855, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 2.8127501010894775 + }, + { + "auxiliary_loss_clip": 0.0166169, + "auxiliary_loss_mlp": 0.01539047, + "balance_loss_clip": 1.29702616, + "balance_loss_mlp": 1.17188287, + "epoch": 0.022305726739816623, + "flos": 21397991806080.0, + "grad_norm": 2.7492992603824478, + "language_loss": 0.81733811, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84934545, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.8958115577697754 + }, + { + "auxiliary_loss_clip": 0.0165731, + "auxiliary_loss_mlp": 0.01510578, + "balance_loss_clip": 1.2943449, + "balance_loss_mlp": 1.1354022, + "epoch": 0.022365849992484595, + "flos": 22494354128640.0, + "grad_norm": 2.1252087304725826, + "language_loss": 0.83383071, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.86550957, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.7354915142059326 + }, + { + "auxiliary_loss_clip": 0.01649315, + "auxiliary_loss_mlp": 0.01500641, + "balance_loss_clip": 1.28585911, + "balance_loss_mlp": 1.12451148, + "epoch": 0.022425973245152563, + "flos": 17858424719520.0, + "grad_norm": 4.513462471907784, + "language_loss": 0.78365338, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81515288, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 2.7984185218811035 + }, + { + "auxiliary_loss_clip": 0.01658493, + "auxiliary_loss_mlp": 0.01516056, + "balance_loss_clip": 1.29293215, + "balance_loss_mlp": 1.13286948, + "epoch": 0.022486096497820532, + "flos": 15484629216960.0, + "grad_norm": 4.2397264763577285, + "language_loss": 0.78005654, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.81180203, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.749908447265625 + }, + { + "auxiliary_loss_clip": 0.01652449, + "auxiliary_loss_mlp": 0.01497776, + "balance_loss_clip": 1.2878871, + "balance_loss_mlp": 1.12813175, + "epoch": 0.0225462197504885, + "flos": 27786750688800.0, + "grad_norm": 1.910273112649104, + "language_loss": 0.86380744, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.89530969, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.860337257385254 + }, + { + "auxiliary_loss_clip": 0.01655843, + "auxiliary_loss_mlp": 0.01520249, + "balance_loss_clip": 1.29034948, + "balance_loss_mlp": 1.13572693, + "epoch": 0.02260634300315647, + "flos": 19977605933760.0, + "grad_norm": 3.00399721418333, + "language_loss": 0.89273196, + "learning_rate": 3.817778917253314e-06, + "loss": 0.9244929, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 2.7806079387664795 + }, + { + "auxiliary_loss_clip": 0.01663734, + "auxiliary_loss_mlp": 0.01513422, + "balance_loss_clip": 1.29931414, + "balance_loss_mlp": 1.13939071, + "epoch": 0.02266646625582444, + "flos": 16029700269120.0, + "grad_norm": 2.725793422128946, + "language_loss": 0.74998021, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.78175175, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.7711825370788574 + }, + { + "auxiliary_loss_clip": 0.01657805, + "auxiliary_loss_mlp": 0.01512426, + "balance_loss_clip": 1.29159081, + "balance_loss_mlp": 1.13286364, + "epoch": 0.02272658950849241, + "flos": 20406274369920.0, + "grad_norm": 2.41296729225282, + "language_loss": 0.99705529, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.02875757, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 2.7684834003448486 + }, + { + "auxiliary_loss_clip": 0.01798541, + "auxiliary_loss_mlp": 0.01420273, + "balance_loss_clip": 1.41539025, + "balance_loss_mlp": 1.06855774, + "epoch": 0.02278671276116038, + "flos": 69854051204640.0, + "grad_norm": 0.9827340748220446, + "language_loss": 0.7541275, + "learning_rate": 3.822895650276492e-06, + "loss": 0.78631556, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 3.4302470684051514 + }, + { + "auxiliary_loss_clip": 0.01649191, + "auxiliary_loss_mlp": 0.01525262, + "balance_loss_clip": 1.28365839, + "balance_loss_mlp": 1.14989626, + "epoch": 0.022846836013828347, + "flos": 38511841530240.0, + "grad_norm": 2.473535424230106, + "language_loss": 0.78412402, + "learning_rate": 3.824592231451859e-06, + "loss": 0.81586856, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.96722149848938 + }, + { + "auxiliary_loss_clip": 0.01657571, + "auxiliary_loss_mlp": 0.01508469, + "balance_loss_clip": 1.29343677, + "balance_loss_mlp": 1.13920593, + "epoch": 0.02290695926649632, + "flos": 20961586025280.0, + "grad_norm": 4.95485753643894, + "language_loss": 0.96898115, + "learning_rate": 3.826284353801652e-06, + "loss": 1.00064147, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 2.8666152954101562 + }, + { + "auxiliary_loss_clip": 0.0165935, + "auxiliary_loss_mlp": 0.01524499, + "balance_loss_clip": 1.29428685, + "balance_loss_mlp": 1.15142179, + "epoch": 0.022967082519164288, + "flos": 24024581045280.0, + "grad_norm": 2.3423026942487257, + "language_loss": 0.87700695, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90884542, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.9100594520568848 + }, + { + "auxiliary_loss_clip": 0.01653972, + "auxiliary_loss_mlp": 0.01513929, + "balance_loss_clip": 1.28936875, + "balance_loss_mlp": 1.14638305, + "epoch": 0.023027205771832256, + "flos": 20999324908800.0, + "grad_norm": 2.028360908434398, + "language_loss": 0.8481729, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87985194, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.8179190158843994 + }, + { + "auxiliary_loss_clip": 0.01659158, + "auxiliary_loss_mlp": 0.01525557, + "balance_loss_clip": 1.29650187, + "balance_loss_mlp": 1.15343356, + "epoch": 0.023087329024500225, + "flos": 21362945821920.0, + "grad_norm": 33.89948210391325, + "language_loss": 0.83099109, + "learning_rate": 3.831334200735543e-06, + "loss": 0.86283827, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.805922269821167 + }, + { + "auxiliary_loss_clip": 0.01658303, + "auxiliary_loss_mlp": 0.01522267, + "balance_loss_clip": 1.29339886, + "balance_loss_mlp": 1.15071559, + "epoch": 0.023147452277168194, + "flos": 21874639728960.0, + "grad_norm": 1.8838927284101412, + "language_loss": 0.8933447, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.92515039, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 2.8305270671844482 + }, + { + "auxiliary_loss_clip": 0.01664875, + "auxiliary_loss_mlp": 0.01500523, + "balance_loss_clip": 1.30160666, + "balance_loss_mlp": 1.13068795, + "epoch": 0.023207575529836166, + "flos": 18918716997600.0, + "grad_norm": 1.797927211061626, + "language_loss": 0.69808298, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72973692, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.7852303981781006 + }, + { + "auxiliary_loss_clip": 0.01655112, + "auxiliary_loss_mlp": 0.01496149, + "balance_loss_clip": 1.29262042, + "balance_loss_mlp": 1.12669563, + "epoch": 0.023267698782504134, + "flos": 25048348140960.0, + "grad_norm": 1.9090765826823348, + "language_loss": 0.88152713, + "learning_rate": 3.836344748851495e-06, + "loss": 0.91303974, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 2.896854877471924 + }, + { + "auxiliary_loss_clip": 0.0165073, + "auxiliary_loss_mlp": 0.01526356, + "balance_loss_clip": 1.28571975, + "balance_loss_mlp": 1.16281497, + "epoch": 0.023327822035172103, + "flos": 28881899310240.0, + "grad_norm": 2.9921320995156493, + "language_loss": 0.833094, + "learning_rate": 3.838006303795566e-06, + "loss": 0.86486483, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 2.797048807144165 + }, + { + "auxiliary_loss_clip": 0.01651999, + "auxiliary_loss_mlp": 0.01506947, + "balance_loss_clip": 1.28668475, + "balance_loss_mlp": 1.14283395, + "epoch": 0.02338794528784007, + "flos": 27123759894240.0, + "grad_norm": 2.645611711002063, + "language_loss": 0.93839669, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96998608, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.88600754737854 + }, + { + "auxiliary_loss_clip": 0.01658504, + "auxiliary_loss_mlp": 0.01537213, + "balance_loss_clip": 1.29390407, + "balance_loss_mlp": 1.18225527, + "epoch": 0.02344806854050804, + "flos": 21324372518880.0, + "grad_norm": 4.367515662960993, + "language_loss": 0.87968802, + "learning_rate": 3.841316605090178e-06, + "loss": 0.91164517, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.8082776069641113 + }, + { + "auxiliary_loss_clip": 0.01662362, + "auxiliary_loss_mlp": 0.0152956, + "balance_loss_clip": 1.29786038, + "balance_loss_mlp": 1.16182375, + "epoch": 0.023508191793176012, + "flos": 24792292582560.0, + "grad_norm": 2.460108619328274, + "language_loss": 0.89439756, + "learning_rate": 3.842965395193529e-06, + "loss": 0.92631686, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.8116531372070312 + }, + { + "auxiliary_loss_clip": 0.01660698, + "auxiliary_loss_mlp": 0.0153927, + "balance_loss_clip": 1.29606414, + "balance_loss_mlp": 1.17859077, + "epoch": 0.02356831504584398, + "flos": 25997813242560.0, + "grad_norm": 9.69268397812041, + "language_loss": 0.86220515, + "learning_rate": 3.84460997382332e-06, + "loss": 0.8942048, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.861618757247925 + }, + { + "auxiliary_loss_clip": 0.01657385, + "auxiliary_loss_mlp": 0.01513265, + "balance_loss_clip": 1.29323637, + "balance_loss_mlp": 1.14114118, + "epoch": 0.02362843829851195, + "flos": 19064476373760.0, + "grad_norm": 1.8779503823218846, + "language_loss": 0.89173663, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.92344314, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.8068349361419678 + }, + { + "auxiliary_loss_clip": 0.01666426, + "auxiliary_loss_mlp": 0.01528934, + "balance_loss_clip": 1.302917, + "balance_loss_mlp": 1.16081595, + "epoch": 0.023688561551179918, + "flos": 16072900807680.0, + "grad_norm": 2.5400202251896795, + "language_loss": 0.81674886, + "learning_rate": 3.84788658233771e-06, + "loss": 0.84870249, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.8509469032287598 + }, + { + "auxiliary_loss_clip": 0.01664009, + "auxiliary_loss_mlp": 0.01525628, + "balance_loss_clip": 1.29885817, + "balance_loss_mlp": 1.15636504, + "epoch": 0.023748684803847887, + "flos": 21726452950560.0, + "grad_norm": 3.7604261525382365, + "language_loss": 0.8601082, + "learning_rate": 3.84951865465269e-06, + "loss": 0.89200461, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 4.401262044906616 + }, + { + "auxiliary_loss_clip": 0.01785334, + "auxiliary_loss_mlp": 0.01479553, + "balance_loss_clip": 1.40170252, + "balance_loss_mlp": 1.15911865, + "epoch": 0.02380880805651586, + "flos": 61932258721440.0, + "grad_norm": 0.9477340627009901, + "language_loss": 0.63804621, + "learning_rate": 3.851146600358172e-06, + "loss": 0.67069507, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 6.123257875442505 + }, + { + "auxiliary_loss_clip": 0.0166101, + "auxiliary_loss_mlp": 0.01525707, + "balance_loss_clip": 1.29610562, + "balance_loss_mlp": 1.15606236, + "epoch": 0.023868931309183827, + "flos": 20268404051040.0, + "grad_norm": 3.3929245382772475, + "language_loss": 0.84075338, + "learning_rate": 3.852770440269372e-06, + "loss": 0.87262052, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 4.284027099609375 + }, + { + "auxiliary_loss_clip": 0.01662035, + "auxiliary_loss_mlp": 0.01519541, + "balance_loss_clip": 1.29752088, + "balance_loss_mlp": 1.1477989, + "epoch": 0.023929054561851796, + "flos": 21141063900000.0, + "grad_norm": 2.6542208836062064, + "language_loss": 0.84386468, + "learning_rate": 3.854390195044404e-06, + "loss": 0.87568045, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 2.7697954177856445 + }, + { + "auxiliary_loss_clip": 0.01655343, + "auxiliary_loss_mlp": 0.01502665, + "balance_loss_clip": 1.29130054, + "balance_loss_mlp": 1.13492775, + "epoch": 0.023989177814519765, + "flos": 13700205221760.0, + "grad_norm": 13.594921095946441, + "language_loss": 0.86375678, + "learning_rate": 3.856005885185868e-06, + "loss": 0.89533687, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 2.766400098800659 + }, + { + "auxiliary_loss_clip": 0.01659337, + "auxiliary_loss_mlp": 0.01498859, + "balance_loss_clip": 1.29438007, + "balance_loss_mlp": 1.12272954, + "epoch": 0.024049301067187733, + "flos": 26324377979040.0, + "grad_norm": 2.096349323292437, + "language_loss": 0.86356056, + "learning_rate": 3.857617531042398e-06, + "loss": 0.8951425, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 2.8016343116760254 + }, + { + "auxiliary_loss_clip": 0.01669563, + "auxiliary_loss_mlp": 0.01543757, + "balance_loss_clip": 1.3060962, + "balance_loss_mlp": 1.16610229, + "epoch": 0.024109424319855705, + "flos": 24427723465440.0, + "grad_norm": 1.8058187752254802, + "language_loss": 0.79327923, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.82541251, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.8738462924957275 + }, + { + "auxiliary_loss_clip": 0.0165441, + "auxiliary_loss_mlp": 0.01502448, + "balance_loss_clip": 1.28941822, + "balance_loss_mlp": 1.12765348, + "epoch": 0.024169547572523674, + "flos": 29606865446880.0, + "grad_norm": 2.289372171793729, + "language_loss": 0.78432196, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.81589055, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.865419864654541 + }, + { + "auxiliary_loss_clip": 0.01661889, + "auxiliary_loss_mlp": 0.0150338, + "balance_loss_clip": 1.29699874, + "balance_loss_mlp": 1.1297307, + "epoch": 0.024229670825191642, + "flos": 22603664178720.0, + "grad_norm": 6.029662034913224, + "language_loss": 0.9469496, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97860229, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.8460497856140137 + }, + { + "auxiliary_loss_clip": 0.01648469, + "auxiliary_loss_mlp": 0.01510613, + "balance_loss_clip": 1.28306293, + "balance_loss_mlp": 1.1260916, + "epoch": 0.02428979407785961, + "flos": 18152219161440.0, + "grad_norm": 2.9949233496552967, + "language_loss": 0.99485993, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02645075, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.8744139671325684 + }, + { + "auxiliary_loss_clip": 0.01660341, + "auxiliary_loss_mlp": 0.01503856, + "balance_loss_clip": 1.29520011, + "balance_loss_mlp": 1.12524748, + "epoch": 0.024349917330527583, + "flos": 15306820181280.0, + "grad_norm": 2.408670864731236, + "language_loss": 0.87829006, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90993208, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.743396759033203 + }, + { + "auxiliary_loss_clip": 0.01665643, + "auxiliary_loss_mlp": 0.01546412, + "balance_loss_clip": 1.2996161, + "balance_loss_mlp": 1.16646767, + "epoch": 0.024410040583195552, + "flos": 20775812076000.0, + "grad_norm": 5.5041206077075415, + "language_loss": 0.93322551, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9653461, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.987781047821045 + }, + { + "auxiliary_loss_clip": 0.01657329, + "auxiliary_loss_mlp": 0.01499697, + "balance_loss_clip": 1.29188824, + "balance_loss_mlp": 1.11937213, + "epoch": 0.02447016383586352, + "flos": 21801020441760.0, + "grad_norm": 2.3377594228328826, + "language_loss": 0.87068093, + "learning_rate": 3.86878748971496e-06, + "loss": 0.90225112, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.8557116985321045 + }, + { + "auxiliary_loss_clip": 0.01671079, + "auxiliary_loss_mlp": 0.01506138, + "balance_loss_clip": 1.30671334, + "balance_loss_mlp": 1.12390518, + "epoch": 0.02453028708853149, + "flos": 33950934750240.0, + "grad_norm": 4.21927561728325, + "language_loss": 0.74006361, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.77183574, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 2.9985828399658203 + }, + { + "auxiliary_loss_clip": 0.01666434, + "auxiliary_loss_mlp": 0.01505936, + "balance_loss_clip": 1.3031342, + "balance_loss_mlp": 1.12160492, + "epoch": 0.024590410341199458, + "flos": 21794838151680.0, + "grad_norm": 4.481475412061342, + "language_loss": 0.92553651, + "learning_rate": 3.871943634189376e-06, + "loss": 0.95726025, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 2.9073758125305176 + }, + { + "auxiliary_loss_clip": 0.01659289, + "auxiliary_loss_mlp": 0.01535996, + "balance_loss_clip": 1.29494309, + "balance_loss_mlp": 1.15204644, + "epoch": 0.02465053359386743, + "flos": 35117616610080.0, + "grad_norm": 2.7309857116200456, + "language_loss": 0.82620823, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85816109, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 3.0593247413635254 + }, + { + "auxiliary_loss_clip": 0.0165698, + "auxiliary_loss_mlp": 0.01498286, + "balance_loss_clip": 1.2922914, + "balance_loss_mlp": 1.12177527, + "epoch": 0.0247106568465354, + "flos": 27453738165120.0, + "grad_norm": 3.567856612288212, + "language_loss": 0.7759518, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80750448, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.9733963012695312 + }, + { + "auxiliary_loss_clip": 0.01664285, + "auxiliary_loss_mlp": 0.0151719, + "balance_loss_clip": 1.30101585, + "balance_loss_mlp": 1.13419414, + "epoch": 0.024770780099203367, + "flos": 20705871820320.0, + "grad_norm": 3.084974258775491, + "language_loss": 0.86702323, + "learning_rate": 3.87664903040738e-06, + "loss": 0.89883804, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 2.7960433959960938 + }, + { + "auxiliary_loss_clip": 0.01764787, + "auxiliary_loss_mlp": 0.0143869, + "balance_loss_clip": 1.38528073, + "balance_loss_mlp": 1.05874634, + "epoch": 0.024830903351871336, + "flos": 69558360354720.0, + "grad_norm": 0.9510074107827714, + "language_loss": 0.58527327, + "learning_rate": 3.878209884949994e-06, + "loss": 0.61730802, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 3.535040855407715 + }, + { + "auxiliary_loss_clip": 0.01654166, + "auxiliary_loss_mlp": 0.01504539, + "balance_loss_clip": 1.28877139, + "balance_loss_mlp": 1.13222492, + "epoch": 0.024891026604539304, + "flos": 32273241690240.0, + "grad_norm": 2.277042493131835, + "language_loss": 0.80433619, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83592319, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.984839677810669 + }, + { + "auxiliary_loss_clip": 0.01657263, + "auxiliary_loss_mlp": 0.01518148, + "balance_loss_clip": 1.29141593, + "balance_loss_mlp": 1.13820434, + "epoch": 0.024951149857207276, + "flos": 18842101385760.0, + "grad_norm": 3.8768144198223142, + "language_loss": 0.80771524, + "learning_rate": 3.881320288020917e-06, + "loss": 0.83946931, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 2.867588996887207 + }, + { + "auxiliary_loss_clip": 0.01664384, + "auxiliary_loss_mlp": 0.0150601, + "balance_loss_clip": 1.29942322, + "balance_loss_mlp": 1.12454033, + "epoch": 0.025011273109875245, + "flos": 15378998198400.0, + "grad_norm": 7.420071949587525, + "language_loss": 0.9617222, + "learning_rate": 3.882869872844723e-06, + "loss": 0.99342614, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 2.904597043991089 + }, + { + "auxiliary_loss_clip": 0.01662541, + "auxiliary_loss_mlp": 0.01512378, + "balance_loss_clip": 1.2968421, + "balance_loss_mlp": 1.1408267, + "epoch": 0.025071396362543213, + "flos": 18917465368320.0, + "grad_norm": 3.994850091545749, + "language_loss": 0.77689183, + "learning_rate": 3.884415737173176e-06, + "loss": 0.80864096, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 2.8113133907318115 + }, + { + "auxiliary_loss_clip": 0.01671814, + "auxiliary_loss_mlp": 0.01541156, + "balance_loss_clip": 1.30602682, + "balance_loss_mlp": 1.17017651, + "epoch": 0.025131519615211182, + "flos": 25340322031200.0, + "grad_norm": 1.8593249802604788, + "language_loss": 0.77174616, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.80387586, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 2.942448139190674 + }, + { + "auxiliary_loss_clip": 0.01655439, + "auxiliary_loss_mlp": 0.01526037, + "balance_loss_clip": 1.28906608, + "balance_loss_mlp": 1.16325951, + "epoch": 0.02519164286787915, + "flos": 18955166323680.0, + "grad_norm": 3.1239820746992786, + "language_loss": 0.8185752, + "learning_rate": 3.887496375507294e-06, + "loss": 0.85038996, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.9454283714294434 + }, + { + "auxiliary_loss_clip": 0.0166492, + "auxiliary_loss_mlp": 0.01508401, + "balance_loss_clip": 1.30014253, + "balance_loss_mlp": 1.14295316, + "epoch": 0.025251766120547123, + "flos": 17423042999040.0, + "grad_norm": 5.311374154067744, + "language_loss": 0.73724425, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.76897752, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.8464035987854004 + }, + { + "auxiliary_loss_clip": 0.01647915, + "auxiliary_loss_mlp": 0.01511781, + "balance_loss_clip": 1.28255522, + "balance_loss_mlp": 1.14919353, + "epoch": 0.02531188937321509, + "flos": 25047741290400.0, + "grad_norm": 1.7029498433165484, + "language_loss": 0.78666902, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81826597, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.818241834640503 + }, + { + "auxiliary_loss_clip": 0.01667123, + "auxiliary_loss_mlp": 0.01521036, + "balance_loss_clip": 1.30284464, + "balance_loss_mlp": 1.15616, + "epoch": 0.02537201262588306, + "flos": 30594903851520.0, + "grad_norm": 2.40409307413761, + "language_loss": 0.81935275, + "learning_rate": 3.89208987073549e-06, + "loss": 0.85123444, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.8118934631347656 + }, + { + "auxiliary_loss_clip": 0.01663365, + "auxiliary_loss_mlp": 0.01511288, + "balance_loss_clip": 1.29725623, + "balance_loss_mlp": 1.15194297, + "epoch": 0.02543213587855103, + "flos": 26067639713760.0, + "grad_norm": 4.016544342048583, + "language_loss": 0.83717012, + "learning_rate": 3.893613781940409e-06, + "loss": 0.86891663, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 2.8229174613952637 + }, + { + "auxiliary_loss_clip": 0.01658345, + "auxiliary_loss_mlp": 0.01532458, + "balance_loss_clip": 1.29362869, + "balance_loss_mlp": 1.15823603, + "epoch": 0.025492259131218997, + "flos": 36025474155840.0, + "grad_norm": 2.9324328625957703, + "language_loss": 0.7402494, + "learning_rate": 3.895134094768415e-06, + "loss": 0.77215743, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.8625619411468506 + }, + { + "auxiliary_loss_clip": 0.01659125, + "auxiliary_loss_mlp": 0.01542025, + "balance_loss_clip": 1.29450536, + "balance_loss_mlp": 1.17505133, + "epoch": 0.02555238238388697, + "flos": 18590028284160.0, + "grad_norm": 2.8048724535392804, + "language_loss": 0.83592874, + "learning_rate": 3.896650826173015e-06, + "loss": 0.86794031, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 2.9041635990142822 + }, + { + "auxiliary_loss_clip": 0.01661141, + "auxiliary_loss_mlp": 0.01554208, + "balance_loss_clip": 1.29570365, + "balance_loss_mlp": 1.18971372, + "epoch": 0.025612505636554938, + "flos": 24245476835040.0, + "grad_norm": 2.913258272521556, + "language_loss": 0.85537279, + "learning_rate": 3.898163992988186e-06, + "loss": 0.88752633, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.7886931896209717 + }, + { + "auxiliary_loss_clip": 0.01755701, + "auxiliary_loss_mlp": 0.01439255, + "balance_loss_clip": 1.37640715, + "balance_loss_mlp": 1.10203552, + "epoch": 0.025672628889222907, + "flos": 60593532906240.0, + "grad_norm": 0.8904096970709691, + "language_loss": 0.57215488, + "learning_rate": 3.899673611929491e-06, + "loss": 0.60410446, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 3.4364373683929443 + }, + { + "auxiliary_loss_clip": 0.01654662, + "auxiliary_loss_mlp": 0.01503764, + "balance_loss_clip": 1.28835177, + "balance_loss_mlp": 1.14174914, + "epoch": 0.025732752141890875, + "flos": 19575297933120.0, + "grad_norm": 2.9758835276475137, + "language_loss": 0.88464898, + "learning_rate": 3.901179699595194e-06, + "loss": 0.91623318, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 2.759230852127075 + }, + { + "auxiliary_loss_clip": 0.01643334, + "auxiliary_loss_mlp": 0.0151388, + "balance_loss_clip": 1.27711558, + "balance_loss_mlp": 1.14461708, + "epoch": 0.025792875394558847, + "flos": 31286910052800.0, + "grad_norm": 1.7438214184808454, + "language_loss": 0.85808873, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88966089, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 2.7846434116363525 + }, + { + "auxiliary_loss_clip": 0.01637068, + "auxiliary_loss_mlp": 0.01510716, + "balance_loss_clip": 1.27068019, + "balance_loss_mlp": 1.13821125, + "epoch": 0.025852998647226816, + "flos": 32382362099520.0, + "grad_norm": 2.7479298849201057, + "language_loss": 0.88032365, + "learning_rate": 3.904181346912895e-06, + "loss": 0.91180146, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.8243205547332764 + }, + { + "auxiliary_loss_clip": 0.01659115, + "auxiliary_loss_mlp": 0.01510466, + "balance_loss_clip": 1.29358959, + "balance_loss_mlp": 1.13910508, + "epoch": 0.025913121899894784, + "flos": 20195353686240.0, + "grad_norm": 1.7969141283376728, + "language_loss": 0.84254003, + "learning_rate": 3.905676939184698e-06, + "loss": 0.87423581, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.776992082595825 + }, + { + "auxiliary_loss_clip": 0.01650933, + "auxiliary_loss_mlp": 0.01504378, + "balance_loss_clip": 1.2844609, + "balance_loss_mlp": 1.1322546, + "epoch": 0.025973245152562753, + "flos": 14722189693920.0, + "grad_norm": 11.436123031584216, + "language_loss": 0.86713994, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89869303, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.723393678665161 + }, + { + "auxiliary_loss_clip": 0.01657655, + "auxiliary_loss_mlp": 0.01514303, + "balance_loss_clip": 1.29020441, + "balance_loss_mlp": 1.13817346, + "epoch": 0.02603336840523072, + "flos": 30995656797600.0, + "grad_norm": 3.1870333762101604, + "language_loss": 0.76220369, + "learning_rate": 3.908657741654636e-06, + "loss": 0.79392326, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.866650342941284 + }, + { + "auxiliary_loss_clip": 0.01648111, + "auxiliary_loss_mlp": 0.01495709, + "balance_loss_clip": 1.27993321, + "balance_loss_mlp": 1.11423874, + "epoch": 0.026093491657898694, + "flos": 17676102232800.0, + "grad_norm": 2.115790347630501, + "language_loss": 0.9003607, + "learning_rate": 3.910142983797699e-06, + "loss": 0.93179893, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 5.860326766967773 + }, + { + "auxiliary_loss_clip": 0.0165344, + "auxiliary_loss_mlp": 0.01518529, + "balance_loss_clip": 1.28800905, + "balance_loss_mlp": 1.13667834, + "epoch": 0.026153614910566662, + "flos": 17859790133280.0, + "grad_norm": 2.3273009348448217, + "language_loss": 0.80157775, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.83329749, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 4.183810472488403 + }, + { + "auxiliary_loss_clip": 0.01651382, + "auxiliary_loss_mlp": 0.01507506, + "balance_loss_clip": 1.28612113, + "balance_loss_mlp": 1.13366604, + "epoch": 0.02621373816323463, + "flos": 20013182912160.0, + "grad_norm": 2.993084405268855, + "language_loss": 0.86453724, + "learning_rate": 3.913103228936546e-06, + "loss": 0.89612615, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 2.791076421737671 + }, + { + "auxiliary_loss_clip": 0.01664341, + "auxiliary_loss_mlp": 0.01517799, + "balance_loss_clip": 1.29655111, + "balance_loss_mlp": 1.14128792, + "epoch": 0.0262738614159026, + "flos": 19283210258400.0, + "grad_norm": 2.7610549815968968, + "language_loss": 0.74799603, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77981746, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 2.8147523403167725 + }, + { + "auxiliary_loss_clip": 0.01669263, + "auxiliary_loss_mlp": 0.01492358, + "balance_loss_clip": 1.30379391, + "balance_loss_mlp": 1.11432195, + "epoch": 0.026333984668570568, + "flos": 18809217306720.0, + "grad_norm": 2.790641367142691, + "language_loss": 0.91360539, + "learning_rate": 3.916049925995316e-06, + "loss": 0.94522166, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 2.7960610389709473 + }, + { + "auxiliary_loss_clip": 0.01786674, + "auxiliary_loss_mlp": 0.01449783, + "balance_loss_clip": 1.41269219, + "balance_loss_mlp": 1.12782288, + "epoch": 0.02639410792123854, + "flos": 64579291238880.0, + "grad_norm": 0.875157008054153, + "language_loss": 0.62573457, + "learning_rate": 3.917518232637377e-06, + "loss": 0.65809917, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.4431517124176025 + }, + { + "auxiliary_loss_clip": 0.01669502, + "auxiliary_loss_mlp": 0.01514576, + "balance_loss_clip": 1.30367589, + "balance_loss_mlp": 1.13692117, + "epoch": 0.02645423117390651, + "flos": 28475874349920.0, + "grad_norm": 4.528500503329164, + "language_loss": 0.75887764, + "learning_rate": 3.918983198419573e-06, + "loss": 0.79071844, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.829258918762207 + }, + { + "auxiliary_loss_clip": 0.0165981, + "auxiliary_loss_mlp": 0.01488156, + "balance_loss_clip": 1.29251838, + "balance_loss_mlp": 1.10706747, + "epoch": 0.026514354426574478, + "flos": 18553199676480.0, + "grad_norm": 2.8060074569208866, + "language_loss": 0.83167952, + "learning_rate": 3.920444838510415e-06, + "loss": 0.86315918, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.7759592533111572 + }, + { + "auxiliary_loss_clip": 0.0166132, + "auxiliary_loss_mlp": 0.01519117, + "balance_loss_clip": 1.29454744, + "balance_loss_mlp": 1.14584923, + "epoch": 0.026574477679242446, + "flos": 20669915560320.0, + "grad_norm": 2.001610701638257, + "language_loss": 0.78465813, + "learning_rate": 3.92190316797534e-06, + "loss": 0.81646252, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.802563190460205 + }, + { + "auxiliary_loss_clip": 0.01779062, + "auxiliary_loss_mlp": 0.01430054, + "balance_loss_clip": 1.40629506, + "balance_loss_mlp": 1.09359741, + "epoch": 0.026634600931910415, + "flos": 57962354359680.0, + "grad_norm": 0.9646418820448297, + "language_loss": 0.64402771, + "learning_rate": 3.92335820177765e-06, + "loss": 0.67611891, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 3.2078306674957275 + }, + { + "auxiliary_loss_clip": 0.01670198, + "auxiliary_loss_mlp": 0.0150478, + "balance_loss_clip": 1.30590487, + "balance_loss_mlp": 1.12731576, + "epoch": 0.026694724184578387, + "flos": 15816959033760.0, + "grad_norm": 2.3960746511304043, + "language_loss": 0.82763928, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85938907, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.8602428436279297 + }, + { + "auxiliary_loss_clip": 0.01655447, + "auxiliary_loss_mlp": 0.01531607, + "balance_loss_clip": 1.28998899, + "balance_loss_mlp": 1.16043735, + "epoch": 0.026754847437246355, + "flos": 23442491744640.0, + "grad_norm": 4.500983993571733, + "language_loss": 0.95945001, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.99132067, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.8450186252593994 + }, + { + "auxiliary_loss_clip": 0.01656684, + "auxiliary_loss_mlp": 0.0150414, + "balance_loss_clip": 1.2903136, + "balance_loss_mlp": 1.1217171, + "epoch": 0.026814970689914324, + "flos": 17343393134400.0, + "grad_norm": 6.592403398065877, + "language_loss": 0.92052424, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.95213246, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.7907333374023438 + }, + { + "auxiliary_loss_clip": 0.01660915, + "auxiliary_loss_mlp": 0.01500421, + "balance_loss_clip": 1.29688013, + "balance_loss_mlp": 1.11570883, + "epoch": 0.026875093942582293, + "flos": 17896656669120.0, + "grad_norm": 2.815876687102289, + "language_loss": 0.79907173, + "learning_rate": 3.92914567610317e-06, + "loss": 0.83068514, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.741558313369751 + }, + { + "auxiliary_loss_clip": 0.01671007, + "auxiliary_loss_mlp": 0.01511453, + "balance_loss_clip": 1.30580997, + "balance_loss_mlp": 1.14524233, + "epoch": 0.026935217195250265, + "flos": 21726035740800.0, + "grad_norm": 2.2010859276320724, + "language_loss": 0.86715412, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89897871, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 2.7656912803649902 + }, + { + "auxiliary_loss_clip": 0.01665629, + "auxiliary_loss_mlp": 0.0152573, + "balance_loss_clip": 1.30223012, + "balance_loss_mlp": 1.15398788, + "epoch": 0.026995340447918233, + "flos": 23624928015840.0, + "grad_norm": 3.38468052874263, + "language_loss": 0.88963324, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.92154682, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 2.832662582397461 + }, + { + "auxiliary_loss_clip": 0.01663683, + "auxiliary_loss_mlp": 0.01511174, + "balance_loss_clip": 1.30032218, + "balance_loss_mlp": 1.12627065, + "epoch": 0.027055463700586202, + "flos": 17933219779680.0, + "grad_norm": 2.940411014667852, + "language_loss": 0.80531633, + "learning_rate": 3.933452395729493e-06, + "loss": 0.83706486, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.7422828674316406 + }, + { + "auxiliary_loss_clip": 0.01664226, + "auxiliary_loss_mlp": 0.01521262, + "balance_loss_clip": 1.30017269, + "balance_loss_mlp": 1.14799404, + "epoch": 0.02711558695325417, + "flos": 25121246793120.0, + "grad_norm": 3.5059820715822476, + "language_loss": 0.81737077, + "learning_rate": 3.934881590952304e-06, + "loss": 0.8492257, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 2.834352493286133 + }, + { + "auxiliary_loss_clip": 0.01663205, + "auxiliary_loss_mlp": 0.01505888, + "balance_loss_clip": 1.29961526, + "balance_loss_mlp": 1.12231994, + "epoch": 0.02717571020592214, + "flos": 24241646090880.0, + "grad_norm": 1.8178044074305078, + "language_loss": 0.76900315, + "learning_rate": 3.936307620734599e-06, + "loss": 0.80069411, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.742245674133301 + }, + { + "auxiliary_loss_clip": 0.01657439, + "auxiliary_loss_mlp": 0.01516111, + "balance_loss_clip": 1.29337168, + "balance_loss_mlp": 1.13082659, + "epoch": 0.02723583345859011, + "flos": 25121284721280.0, + "grad_norm": 1.8621543576721384, + "language_loss": 0.7344408, + "learning_rate": 3.937730499067294e-06, + "loss": 0.76617628, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 2.8321690559387207 + }, + { + "auxiliary_loss_clip": 0.01662702, + "auxiliary_loss_mlp": 0.01496511, + "balance_loss_clip": 1.29792833, + "balance_loss_mlp": 1.11141682, + "epoch": 0.02729595671125808, + "flos": 42744931944480.0, + "grad_norm": 2.095844197229527, + "language_loss": 0.82385695, + "learning_rate": 3.939150239848748e-06, + "loss": 0.85544908, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.9414966106414795 + }, + { + "auxiliary_loss_clip": 0.01660888, + "auxiliary_loss_mlp": 0.01513812, + "balance_loss_clip": 1.29688454, + "balance_loss_mlp": 1.1331048, + "epoch": 0.02735607996392605, + "flos": 21432961933920.0, + "grad_norm": 3.6915280817975233, + "language_loss": 0.75480515, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78655219, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.7628040313720703 + }, + { + "auxiliary_loss_clip": 0.01652486, + "auxiliary_loss_mlp": 0.01499675, + "balance_loss_clip": 1.28842247, + "balance_loss_mlp": 1.12106645, + "epoch": 0.027416203216594017, + "flos": 20853451748160.0, + "grad_norm": 2.7363994862767593, + "language_loss": 0.80974293, + "learning_rate": 3.941980363893499e-06, + "loss": 0.84126449, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 2.767620801925659 + }, + { + "auxiliary_loss_clip": 0.01657614, + "auxiliary_loss_mlp": 0.01540703, + "balance_loss_clip": 1.29195714, + "balance_loss_mlp": 1.17296648, + "epoch": 0.027476326469261986, + "flos": 13226174341920.0, + "grad_norm": 15.621864362871216, + "language_loss": 0.81807292, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.85005611, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 2.7986438274383545 + }, + { + "auxiliary_loss_clip": 0.01649754, + "auxiliary_loss_mlp": 0.01482745, + "balance_loss_clip": 1.2845751, + "balance_loss_mlp": 1.10852289, + "epoch": 0.027536449721929958, + "flos": 24026818806720.0, + "grad_norm": 2.485336140178222, + "language_loss": 0.94176048, + "learning_rate": 3.944798102235412e-06, + "loss": 0.97308552, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 2.8609542846679688 + }, + { + "auxiliary_loss_clip": 0.01654971, + "auxiliary_loss_mlp": 0.01542213, + "balance_loss_clip": 1.28989363, + "balance_loss_mlp": 1.18096173, + "epoch": 0.027596572974597926, + "flos": 13007743882560.0, + "grad_norm": 2.4230511547367586, + "language_loss": 0.79193401, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.82390583, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.7352633476257324 + }, + { + "auxiliary_loss_clip": 0.01661773, + "auxiliary_loss_mlp": 0.0151895, + "balance_loss_clip": 1.29594302, + "balance_loss_mlp": 1.1510222, + "epoch": 0.027656696227265895, + "flos": 26145620739360.0, + "grad_norm": 1.798010536723708, + "language_loss": 0.83382899, + "learning_rate": 3.947603562811407e-06, + "loss": 0.86563623, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 2.890153408050537 + }, + { + "auxiliary_loss_clip": 0.0180521, + "auxiliary_loss_mlp": 0.01547844, + "balance_loss_clip": 1.43858957, + "balance_loss_mlp": 1.25258636, + "epoch": 0.027716819479933864, + "flos": 60703867016640.0, + "grad_norm": 1.6439965121621565, + "language_loss": 0.73530269, + "learning_rate": 3.949001722282675e-06, + "loss": 0.76883316, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 3.35221791267395 + }, + { + "auxiliary_loss_clip": 0.01663865, + "auxiliary_loss_mlp": 0.01528199, + "balance_loss_clip": 1.29929304, + "balance_loss_mlp": 1.16599393, + "epoch": 0.027776942732601832, + "flos": 31214618251200.0, + "grad_norm": 3.2097251323428053, + "language_loss": 0.8108294, + "learning_rate": 3.950396852153582e-06, + "loss": 0.84275007, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.8561906814575195 + }, + { + "auxiliary_loss_clip": 0.0166468, + "auxiliary_loss_mlp": 0.01514382, + "balance_loss_clip": 1.30123091, + "balance_loss_mlp": 1.14645433, + "epoch": 0.027837065985269804, + "flos": 22676866256160.0, + "grad_norm": 3.377961442017947, + "language_loss": 0.90567267, + "learning_rate": 3.951788965525118e-06, + "loss": 0.93746328, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 2.85042667388916 + }, + { + "auxiliary_loss_clip": 0.01801203, + "auxiliary_loss_mlp": 0.01435966, + "balance_loss_clip": 1.43454027, + "balance_loss_mlp": 1.10713959, + "epoch": 0.027897189237937773, + "flos": 62188617705120.0, + "grad_norm": 0.8864078677198223, + "language_loss": 0.59034002, + "learning_rate": 3.953178075413476e-06, + "loss": 0.62271172, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 3.2430942058563232 + }, + { + "auxiliary_loss_clip": 0.01662207, + "auxiliary_loss_mlp": 0.01524366, + "balance_loss_clip": 1.29593647, + "balance_loss_mlp": 1.14995313, + "epoch": 0.02795731249060574, + "flos": 24495160462560.0, + "grad_norm": 14.975780627304465, + "language_loss": 0.81216681, + "learning_rate": 3.954564194750784e-06, + "loss": 0.84403253, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.824732542037964 + }, + { + "auxiliary_loss_clip": 0.01652351, + "auxiliary_loss_mlp": 0.01498597, + "balance_loss_clip": 1.28648663, + "balance_loss_mlp": 1.12323058, + "epoch": 0.02801743574327371, + "flos": 23735300054400.0, + "grad_norm": 4.472945369200903, + "language_loss": 0.78783792, + "learning_rate": 3.955947336385828e-06, + "loss": 0.81934738, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 2.850686550140381 + }, + { + "auxiliary_loss_clip": 0.01651069, + "auxiliary_loss_mlp": 0.01515896, + "balance_loss_clip": 1.28595328, + "balance_loss_mlp": 1.14052927, + "epoch": 0.02807755899594168, + "flos": 20631076760160.0, + "grad_norm": 2.1637213074975, + "language_loss": 0.87716514, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90883482, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.853541374206543 + }, + { + "auxiliary_loss_clip": 0.01647922, + "auxiliary_loss_mlp": 0.01504467, + "balance_loss_clip": 1.28037953, + "balance_loss_mlp": 1.12032652, + "epoch": 0.02813768224860965, + "flos": 19246571291520.0, + "grad_norm": 2.2510650338758653, + "language_loss": 0.86190534, + "learning_rate": 3.958704737531818e-06, + "loss": 0.89342922, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 2.730509042739868 + }, + { + "auxiliary_loss_clip": 0.0165025, + "auxiliary_loss_mlp": 0.01530463, + "balance_loss_clip": 1.28262246, + "balance_loss_mlp": 1.15719533, + "epoch": 0.02819780550127762, + "flos": 20816092146240.0, + "grad_norm": 2.334546209038023, + "language_loss": 0.92148739, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.95329452, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 2.784810781478882 + }, + { + "auxiliary_loss_clip": 0.01642935, + "auxiliary_loss_mlp": 0.01498969, + "balance_loss_clip": 1.27481008, + "balance_loss_mlp": 1.1232208, + "epoch": 0.028257928753945588, + "flos": 19976164663680.0, + "grad_norm": 4.997675856036313, + "language_loss": 0.81748033, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84889936, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.7782061100006104 + }, + { + "auxiliary_loss_clip": 0.01651093, + "auxiliary_loss_mlp": 0.01504506, + "balance_loss_clip": 1.2837944, + "balance_loss_mlp": 1.12456203, + "epoch": 0.028318052006613557, + "flos": 20486531085120.0, + "grad_norm": 2.0038908064209098, + "language_loss": 0.93412882, + "learning_rate": 3.962818822989861e-06, + "loss": 0.96568477, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.8056602478027344 + }, + { + "auxiliary_loss_clip": 0.01643371, + "auxiliary_loss_mlp": 0.01499845, + "balance_loss_clip": 1.27332819, + "balance_loss_mlp": 1.11856616, + "epoch": 0.02837817525928153, + "flos": 28517519833920.0, + "grad_norm": 1.8985732907416748, + "language_loss": 0.75990152, + "learning_rate": 3.964184363657625e-06, + "loss": 0.79133368, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 4.370975732803345 + }, + { + "auxiliary_loss_clip": 0.01643502, + "auxiliary_loss_mlp": 0.0149976, + "balance_loss_clip": 1.27419996, + "balance_loss_mlp": 1.11733627, + "epoch": 0.028438298511949497, + "flos": 18553654814400.0, + "grad_norm": 2.456320228241688, + "language_loss": 0.93592346, + "learning_rate": 3.965547014290071e-06, + "loss": 0.96735603, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 4.17650842666626 + }, + { + "auxiliary_loss_clip": 0.01644228, + "auxiliary_loss_mlp": 0.01515466, + "balance_loss_clip": 1.27534676, + "balance_loss_mlp": 1.12960958, + "epoch": 0.028498421764617466, + "flos": 16912145583360.0, + "grad_norm": 2.6993400246137824, + "language_loss": 0.88451016, + "learning_rate": 3.96690678709433e-06, + "loss": 0.91610706, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 4.079951763153076 + }, + { + "auxiliary_loss_clip": 0.01651782, + "auxiliary_loss_mlp": 0.01515092, + "balance_loss_clip": 1.28416181, + "balance_loss_mlp": 1.13858151, + "epoch": 0.028558545017285435, + "flos": 27780985608480.0, + "grad_norm": 2.5434260902135453, + "language_loss": 0.79028952, + "learning_rate": 3.968263694200355e-06, + "loss": 0.82195824, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 2.676699638366699 + }, + { + "auxiliary_loss_clip": 0.01727319, + "auxiliary_loss_mlp": 0.01481941, + "balance_loss_clip": 1.35994935, + "balance_loss_mlp": 1.10581207, + "epoch": 0.028618668269953403, + "flos": 65661013291680.0, + "grad_norm": 0.9841613891445412, + "language_loss": 0.66946661, + "learning_rate": 3.969617747661569e-06, + "loss": 0.70155919, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 3.218569040298462 + }, + { + "auxiliary_loss_clip": 0.01649661, + "auxiliary_loss_mlp": 0.01515307, + "balance_loss_clip": 1.28003466, + "balance_loss_mlp": 1.14280188, + "epoch": 0.028678791522621375, + "flos": 21938814904320.0, + "grad_norm": 3.998294966917167, + "language_loss": 0.83759862, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86924827, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.7224478721618652 + }, + { + "auxiliary_loss_clip": 0.0165029, + "auxiliary_loss_mlp": 0.01487566, + "balance_loss_clip": 1.2816422, + "balance_loss_mlp": 1.10838509, + "epoch": 0.028738914775289344, + "flos": 24574468973760.0, + "grad_norm": 2.9212663193249653, + "language_loss": 0.82564938, + "learning_rate": 3.97231734148446e-06, + "loss": 0.85702795, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.7384140491485596 + }, + { + "auxiliary_loss_clip": 0.01651245, + "auxiliary_loss_mlp": 0.01504369, + "balance_loss_clip": 1.283939, + "balance_loss_mlp": 1.13701367, + "epoch": 0.028799038027957313, + "flos": 23260207186080.0, + "grad_norm": 2.2576364480918567, + "language_loss": 0.81189513, + "learning_rate": 3.973662905576082e-06, + "loss": 0.84345132, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.768812656402588 + }, + { + "auxiliary_loss_clip": 0.01645315, + "auxiliary_loss_mlp": 0.01509414, + "balance_loss_clip": 1.27616525, + "balance_loss_mlp": 1.13748145, + "epoch": 0.02885916128062528, + "flos": 22166310193920.0, + "grad_norm": 2.5829391630885294, + "language_loss": 0.73749518, + "learning_rate": 3.975005663484038e-06, + "loss": 0.76904249, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.800854206085205 + }, + { + "auxiliary_loss_clip": 0.01643797, + "auxiliary_loss_mlp": 0.01498721, + "balance_loss_clip": 1.27528703, + "balance_loss_mlp": 1.12449932, + "epoch": 0.02891928453329325, + "flos": 22935235432320.0, + "grad_norm": 4.357445738066687, + "language_loss": 0.87805927, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90948445, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.859740972518921 + }, + { + "auxiliary_loss_clip": 0.01723419, + "auxiliary_loss_mlp": 0.01422455, + "balance_loss_clip": 1.35719407, + "balance_loss_mlp": 1.09133911, + "epoch": 0.028979407785961222, + "flos": 57438447585120.0, + "grad_norm": 0.8247449654856219, + "language_loss": 0.66001785, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.69147658, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 3.0713722705841064 + }, + { + "auxiliary_loss_clip": 0.01646489, + "auxiliary_loss_mlp": 0.01515228, + "balance_loss_clip": 1.27811658, + "balance_loss_mlp": 1.1493988, + "epoch": 0.02903953103862919, + "flos": 16724361441600.0, + "grad_norm": 42.188407595420394, + "language_loss": 0.78805089, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81966805, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.7539613246917725 + }, + { + "auxiliary_loss_clip": 0.01646617, + "auxiliary_loss_mlp": 0.0151621, + "balance_loss_clip": 1.2778914, + "balance_loss_mlp": 1.14618421, + "epoch": 0.02909965429129716, + "flos": 16765513859520.0, + "grad_norm": 2.4172869653296996, + "language_loss": 0.7620343, + "learning_rate": 3.980348865796749e-06, + "loss": 0.79366255, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.737678050994873 + }, + { + "auxiliary_loss_clip": 0.01639848, + "auxiliary_loss_mlp": 0.01514149, + "balance_loss_clip": 1.27070522, + "balance_loss_mlp": 1.14927351, + "epoch": 0.029159777543965128, + "flos": 19787015108160.0, + "grad_norm": 2.264988982271628, + "language_loss": 0.84168088, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.8732208, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.7538554668426514 + }, + { + "auxiliary_loss_clip": 0.01643928, + "auxiliary_loss_mlp": 0.01508143, + "balance_loss_clip": 1.27438235, + "balance_loss_mlp": 1.14574635, + "epoch": 0.029219900796633096, + "flos": 19644517553760.0, + "grad_norm": 2.772009522953434, + "language_loss": 0.84444404, + "learning_rate": 3.983003930109732e-06, + "loss": 0.87596482, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.781430721282959 + }, + { + "auxiliary_loss_clip": 0.01640403, + "auxiliary_loss_mlp": 0.01520745, + "balance_loss_clip": 1.27196312, + "balance_loss_mlp": 1.15834868, + "epoch": 0.02928002404930107, + "flos": 25888123910880.0, + "grad_norm": 2.3548755949585503, + "language_loss": 0.88955486, + "learning_rate": 3.984327367746315e-06, + "loss": 0.92116636, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.856297254562378 + }, + { + "auxiliary_loss_clip": 0.0164717, + "auxiliary_loss_mlp": 0.0152805, + "balance_loss_clip": 1.2776773, + "balance_loss_mlp": 1.17080331, + "epoch": 0.029340147301969037, + "flos": 20661912718560.0, + "grad_norm": 2.703362153789043, + "language_loss": 0.88956928, + "learning_rate": 3.985648090637122e-06, + "loss": 0.92132139, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.840446949005127 + }, + { + "auxiliary_loss_clip": 0.01638003, + "auxiliary_loss_mlp": 0.01505505, + "balance_loss_clip": 1.27114201, + "balance_loss_mlp": 1.13986659, + "epoch": 0.029400270554637006, + "flos": 24430454292960.0, + "grad_norm": 2.3918439242220444, + "language_loss": 0.88883227, + "learning_rate": 3.986966109896785e-06, + "loss": 0.92026734, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 2.899308681488037 + }, + { + "auxiliary_loss_clip": 0.01639723, + "auxiliary_loss_mlp": 0.01527325, + "balance_loss_clip": 1.27185941, + "balance_loss_mlp": 1.16683578, + "epoch": 0.029460393807304974, + "flos": 20122986028320.0, + "grad_norm": 2.20062555987085, + "language_loss": 0.88774717, + "learning_rate": 3.988281436571815e-06, + "loss": 0.91941762, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.7898411750793457 + }, + { + "auxiliary_loss_clip": 0.0163559, + "auxiliary_loss_mlp": 0.01516458, + "balance_loss_clip": 1.26793766, + "balance_loss_mlp": 1.15310824, + "epoch": 0.029520517059972943, + "flos": 17677846928160.0, + "grad_norm": 2.986422536102644, + "language_loss": 0.91823316, + "learning_rate": 3.989594081641164e-06, + "loss": 0.94975358, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.761368751525879 + }, + { + "auxiliary_loss_clip": 0.01631702, + "auxiliary_loss_mlp": 0.01514857, + "balance_loss_clip": 1.26180375, + "balance_loss_mlp": 1.14692938, + "epoch": 0.029580640312640915, + "flos": 18955204251840.0, + "grad_norm": 1.99391336521799, + "language_loss": 0.8543945, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.88586009, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.7561237812042236 + }, + { + "auxiliary_loss_clip": 0.01646409, + "auxiliary_loss_mlp": 0.01521656, + "balance_loss_clip": 1.27835119, + "balance_loss_mlp": 1.14667165, + "epoch": 0.029640763565308884, + "flos": 18727671034080.0, + "grad_norm": 2.7165202257856285, + "language_loss": 0.8459155, + "learning_rate": 3.992211370544093e-06, + "loss": 0.87759614, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.766327142715454 + }, + { + "auxiliary_loss_clip": 0.01632519, + "auxiliary_loss_mlp": 0.01520692, + "balance_loss_clip": 1.26255727, + "balance_loss_mlp": 1.15524447, + "epoch": 0.029700886817976852, + "flos": 20597434117920.0, + "grad_norm": 2.2784193613119204, + "language_loss": 0.86715043, + "learning_rate": 3.99351603600268e-06, + "loss": 0.89868259, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 2.765432834625244 + }, + { + "auxiliary_loss_clip": 0.01642124, + "auxiliary_loss_mlp": 0.01543902, + "balance_loss_clip": 1.27361393, + "balance_loss_mlp": 1.17120636, + "epoch": 0.02976101007064482, + "flos": 22239057133440.0, + "grad_norm": 2.3732602222572923, + "language_loss": 0.86998808, + "learning_rate": 3.994818063106668e-06, + "loss": 0.90184838, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.754467010498047 + }, + { + "auxiliary_loss_clip": 0.01650869, + "auxiliary_loss_mlp": 0.01513497, + "balance_loss_clip": 1.28439629, + "balance_loss_mlp": 1.14156425, + "epoch": 0.029821133323312793, + "flos": 23734920772800.0, + "grad_norm": 2.3758467157354977, + "language_loss": 0.621135, + "learning_rate": 3.99611746250533e-06, + "loss": 0.65277863, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 2.7948272228240967 + }, + { + "auxiliary_loss_clip": 0.01648854, + "auxiliary_loss_mlp": 0.01510617, + "balance_loss_clip": 1.28037584, + "balance_loss_mlp": 1.14402509, + "epoch": 0.02988125657598076, + "flos": 22421720973600.0, + "grad_norm": 1.7494446784049726, + "language_loss": 0.88750911, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91910386, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 2.857443332672119 + }, + { + "auxiliary_loss_clip": 0.01640902, + "auxiliary_loss_mlp": 0.01507567, + "balance_loss_clip": 1.27426732, + "balance_loss_mlp": 1.13868546, + "epoch": 0.02994137982864873, + "flos": 13846836945600.0, + "grad_norm": 3.6734927953437246, + "language_loss": 0.85005164, + "learning_rate": 3.998708420462557e-06, + "loss": 0.88153625, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 2.7356109619140625 + }, + { + "auxiliary_loss_clip": 0.0164071, + "auxiliary_loss_mlp": 0.0149884, + "balance_loss_clip": 1.2716099, + "balance_loss_mlp": 1.12767005, + "epoch": 0.0300015030813167, + "flos": 23910264478080.0, + "grad_norm": 2.947326597226834, + "language_loss": 0.78356791, + "learning_rate": 4e-06, + "loss": 0.81496334, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 2.8491976261138916 + }, + { + "auxiliary_loss_clip": 0.01633006, + "auxiliary_loss_mlp": 0.01496117, + "balance_loss_clip": 1.26224256, + "balance_loss_mlp": 1.11750829, + "epoch": 0.030061626333984667, + "flos": 22018919906880.0, + "grad_norm": 2.025262129006964, + "language_loss": 0.82762158, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85891277, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.7633092403411865 + }, + { + "auxiliary_loss_clip": 0.01640406, + "auxiliary_loss_mlp": 0.01529484, + "balance_loss_clip": 1.27101564, + "balance_loss_mlp": 1.16003084, + "epoch": 0.03012174958665264, + "flos": 23042004295680.0, + "grad_norm": 3.345843744878574, + "language_loss": 0.8810842, + "learning_rate": 3.9999998483196e-06, + "loss": 0.91278309, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.8133013248443604 + }, + { + "auxiliary_loss_clip": 0.01630384, + "auxiliary_loss_mlp": 0.01513294, + "balance_loss_clip": 1.26051438, + "balance_loss_mlp": 1.13621092, + "epoch": 0.030181872839320608, + "flos": 18955280108160.0, + "grad_norm": 3.21627822077166, + "language_loss": 0.86563128, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.89706802, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.841930389404297 + }, + { + "auxiliary_loss_clip": 0.01643676, + "auxiliary_loss_mlp": 0.01545648, + "balance_loss_clip": 1.27487612, + "balance_loss_mlp": 1.17314243, + "epoch": 0.030241996091988577, + "flos": 16729936881120.0, + "grad_norm": 2.481356604684194, + "language_loss": 0.84596622, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87785947, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 2.758187770843506 + }, + { + "auxiliary_loss_clip": 0.01647497, + "auxiliary_loss_mlp": 0.01519346, + "balance_loss_clip": 1.27830458, + "balance_loss_mlp": 1.1533258, + "epoch": 0.030302119344656545, + "flos": 28623795631200.0, + "grad_norm": 1.8372048766054252, + "language_loss": 0.88573992, + "learning_rate": 3.999999051997567e-06, + "loss": 0.91740835, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.874821901321411 + }, + { + "auxiliary_loss_clip": 0.01638408, + "auxiliary_loss_mlp": 0.01519749, + "balance_loss_clip": 1.27095342, + "balance_loss_mlp": 1.15010452, + "epoch": 0.030362242597324514, + "flos": 15671161729440.0, + "grad_norm": 2.9779332791161885, + "language_loss": 0.78420168, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.81578326, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.7174744606018066 + }, + { + "auxiliary_loss_clip": 0.01742245, + "auxiliary_loss_mlp": 0.01727768, + "balance_loss_clip": 1.37707901, + "balance_loss_mlp": 1.45082092, + "epoch": 0.030422365849992486, + "flos": 72133745849280.0, + "grad_norm": 1.0860719987265706, + "language_loss": 0.54974937, + "learning_rate": 3.999998141915371e-06, + "loss": 0.58444947, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.4694974422454834 + }, + { + "auxiliary_loss_clip": 0.01625399, + "auxiliary_loss_mlp": 0.01507933, + "balance_loss_clip": 1.25610852, + "balance_loss_mlp": 1.12436521, + "epoch": 0.030482489102660455, + "flos": 19429879910400.0, + "grad_norm": 2.5351314893537786, + "language_loss": 0.83394986, + "learning_rate": 3.999997573114069e-06, + "loss": 0.86528325, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 2.8082339763641357 + }, + { + "auxiliary_loss_clip": 0.01638657, + "auxiliary_loss_mlp": 0.01493454, + "balance_loss_clip": 1.26847255, + "balance_loss_mlp": 1.09882319, + "epoch": 0.030542612355328423, + "flos": 20377903741920.0, + "grad_norm": 2.5937199728987834, + "language_loss": 0.88582152, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91714263, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 2.7369725704193115 + }, + { + "auxiliary_loss_clip": 0.01638686, + "auxiliary_loss_mlp": 0.01480954, + "balance_loss_clip": 1.26953959, + "balance_loss_mlp": 1.09032893, + "epoch": 0.030602735607996392, + "flos": 34680186768960.0, + "grad_norm": 1.8958233959432598, + "language_loss": 0.71898025, + "learning_rate": 3.999996207991165e-06, + "loss": 0.75017667, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 2.886861801147461 + }, + { + "auxiliary_loss_clip": 0.016382, + "auxiliary_loss_mlp": 0.0150496, + "balance_loss_clip": 1.26927328, + "balance_loss_mlp": 1.10403562, + "epoch": 0.03066285886066436, + "flos": 23660884275840.0, + "grad_norm": 2.1815674163743535, + "language_loss": 0.82351732, + "learning_rate": 3.999995411669614e-06, + "loss": 0.854949, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 5.808893203735352 + }, + { + "auxiliary_loss_clip": 0.01637972, + "auxiliary_loss_mlp": 0.01511327, + "balance_loss_clip": 1.27004075, + "balance_loss_mlp": 1.10410833, + "epoch": 0.030722982113332332, + "flos": 23005213616160.0, + "grad_norm": 4.460350410685459, + "language_loss": 0.8380785, + "learning_rate": 3.999994539508036e-06, + "loss": 0.86957145, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 4.282354354858398 + }, + { + "auxiliary_loss_clip": 0.01632869, + "auxiliary_loss_mlp": 0.01491584, + "balance_loss_clip": 1.26353228, + "balance_loss_mlp": 1.09580958, + "epoch": 0.0307831053660003, + "flos": 24752998644480.0, + "grad_norm": 2.5864076964113294, + "language_loss": 0.82375509, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8549996, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 2.7366766929626465 + }, + { + "auxiliary_loss_clip": 0.01624947, + "auxiliary_loss_mlp": 0.01535823, + "balance_loss_clip": 1.25501657, + "balance_loss_mlp": 1.1238358, + "epoch": 0.03084322861866827, + "flos": 26143307121600.0, + "grad_norm": 2.500700516032624, + "language_loss": 0.87496006, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.90656781, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 2.9175384044647217 + }, + { + "auxiliary_loss_clip": 0.01641091, + "auxiliary_loss_mlp": 0.01522441, + "balance_loss_clip": 1.27138281, + "balance_loss_mlp": 1.1085459, + "epoch": 0.03090335187133624, + "flos": 18773336903040.0, + "grad_norm": 2.992743926434249, + "language_loss": 0.79429519, + "learning_rate": 3.999991467983491e-06, + "loss": 0.82593048, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 2.7694058418273926 + }, + { + "auxiliary_loss_clip": 0.01641098, + "auxiliary_loss_mlp": 0.01543702, + "balance_loss_clip": 1.27120256, + "balance_loss_mlp": 1.12446654, + "epoch": 0.030963475124004207, + "flos": 23224137141600.0, + "grad_norm": 3.607549071597642, + "language_loss": 0.77757037, + "learning_rate": 3.999990292462167e-06, + "loss": 0.80941832, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 2.8002028465270996 + }, + { + "auxiliary_loss_clip": 0.01636004, + "auxiliary_loss_mlp": 0.01528692, + "balance_loss_clip": 1.26501358, + "balance_loss_mlp": 1.11479759, + "epoch": 0.03102359837667218, + "flos": 42529080600000.0, + "grad_norm": 1.9783518663501876, + "language_loss": 0.82776451, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85941148, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.986607313156128 + }, + { + "auxiliary_loss_clip": 0.01643268, + "auxiliary_loss_mlp": 0.01518852, + "balance_loss_clip": 1.27153945, + "balance_loss_mlp": 1.10629201, + "epoch": 0.031083721629340148, + "flos": 21178992424320.0, + "grad_norm": 1.981135305753827, + "language_loss": 0.79040629, + "learning_rate": 3.999987713900071e-06, + "loss": 0.8220275, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.772630214691162 + }, + { + "auxiliary_loss_clip": 0.01642669, + "auxiliary_loss_mlp": 0.01537684, + "balance_loss_clip": 1.27218068, + "balance_loss_mlp": 1.12569714, + "epoch": 0.031143844882008116, + "flos": 29718792540000.0, + "grad_norm": 1.708970783702008, + "language_loss": 0.90803796, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93984151, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.8337082862854004 + }, + { + "auxiliary_loss_clip": 0.01646459, + "auxiliary_loss_mlp": 0.01513479, + "balance_loss_clip": 1.27521801, + "balance_loss_mlp": 1.10606921, + "epoch": 0.031203968134676085, + "flos": 23114713307040.0, + "grad_norm": 2.1635940336472723, + "language_loss": 0.86983001, + "learning_rate": 3.999984831979039e-06, + "loss": 0.90142936, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.8100180625915527 + }, + { + "auxiliary_loss_clip": 0.01641471, + "auxiliary_loss_mlp": 0.01523371, + "balance_loss_clip": 1.27071428, + "balance_loss_mlp": 1.11920404, + "epoch": 0.03126409138734405, + "flos": 20956200226560.0, + "grad_norm": 2.182735333349, + "language_loss": 0.86871797, + "learning_rate": 3.999983277259057e-06, + "loss": 0.90036637, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.868302345275879 + }, + { + "auxiliary_loss_clip": 0.01637814, + "auxiliary_loss_mlp": 0.0152424, + "balance_loss_clip": 1.26611102, + "balance_loss_mlp": 1.12121749, + "epoch": 0.031324214640012026, + "flos": 21652113028320.0, + "grad_norm": 1.7964375743897236, + "language_loss": 0.89369208, + "learning_rate": 3.999981646699509e-06, + "loss": 0.92531264, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.832587242126465 + }, + { + "auxiliary_loss_clip": 0.01641481, + "auxiliary_loss_mlp": 0.01494567, + "balance_loss_clip": 1.27130651, + "balance_loss_mlp": 1.10928249, + "epoch": 0.03138433789267999, + "flos": 23443629589440.0, + "grad_norm": 2.290349361721169, + "language_loss": 0.71343035, + "learning_rate": 3.999979940300456e-06, + "loss": 0.74479079, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.7965662479400635 + }, + { + "auxiliary_loss_clip": 0.01640826, + "auxiliary_loss_mlp": 0.01497852, + "balance_loss_clip": 1.27022791, + "balance_loss_mlp": 1.11676383, + "epoch": 0.03144446114534796, + "flos": 18983764520640.0, + "grad_norm": 3.565682893536531, + "language_loss": 0.85300022, + "learning_rate": 3.999978158061963e-06, + "loss": 0.88438702, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.792633533477783 + }, + { + "auxiliary_loss_clip": 0.01638891, + "auxiliary_loss_mlp": 0.01490146, + "balance_loss_clip": 1.26827931, + "balance_loss_mlp": 1.11401653, + "epoch": 0.031504584398015935, + "flos": 22639885935840.0, + "grad_norm": 2.1346637631996948, + "language_loss": 0.90408075, + "learning_rate": 3.999976299984099e-06, + "loss": 0.93537116, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.7987661361694336 + }, + { + "auxiliary_loss_clip": 0.01650212, + "auxiliary_loss_mlp": 0.01533446, + "balance_loss_clip": 1.27929962, + "balance_loss_mlp": 1.15865183, + "epoch": 0.0315647076506839, + "flos": 25299169613280.0, + "grad_norm": 2.374426263977789, + "language_loss": 0.80254948, + "learning_rate": 3.999974366066933e-06, + "loss": 0.83438605, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.765721082687378 + }, + { + "auxiliary_loss_clip": 0.01637614, + "auxiliary_loss_mlp": 0.01504311, + "balance_loss_clip": 1.26635242, + "balance_loss_mlp": 1.13809967, + "epoch": 0.03162483090335187, + "flos": 16984740810240.0, + "grad_norm": 2.2425609173950685, + "language_loss": 0.80763716, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83905643, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.7732205390930176 + }, + { + "auxiliary_loss_clip": 0.01639822, + "auxiliary_loss_mlp": 0.01514967, + "balance_loss_clip": 1.27049601, + "balance_loss_mlp": 1.15066361, + "epoch": 0.03168495415601984, + "flos": 18736508295360.0, + "grad_norm": 2.2451660864614214, + "language_loss": 0.81585252, + "learning_rate": 3.999970270714991e-06, + "loss": 0.84740043, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 2.8014252185821533 + }, + { + "auxiliary_loss_clip": 0.01640902, + "auxiliary_loss_mlp": 0.01507683, + "balance_loss_clip": 1.27132893, + "balance_loss_mlp": 1.14109111, + "epoch": 0.03174507740868781, + "flos": 21216996804960.0, + "grad_norm": 2.0852784139773117, + "language_loss": 0.94006562, + "learning_rate": 3.999968109280371e-06, + "loss": 0.97155142, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.728647470474243 + }, + { + "auxiliary_loss_clip": 0.01645945, + "auxiliary_loss_mlp": 0.01514802, + "balance_loss_clip": 1.27647758, + "balance_loss_mlp": 1.14820933, + "epoch": 0.03180520066135578, + "flos": 24789827252160.0, + "grad_norm": 2.3010937922293624, + "language_loss": 0.84410322, + "learning_rate": 3.99996587200676e-06, + "loss": 0.87571073, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 2.8210349082946777 + }, + { + "auxiliary_loss_clip": 0.01650802, + "auxiliary_loss_mlp": 0.01503791, + "balance_loss_clip": 1.27955401, + "balance_loss_mlp": 1.13319349, + "epoch": 0.03186532391402375, + "flos": 24866632504800.0, + "grad_norm": 3.3324475881046616, + "language_loss": 0.90361798, + "learning_rate": 3.999963558894243e-06, + "loss": 0.93516386, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.794492721557617 + }, + { + "auxiliary_loss_clip": 0.01638832, + "auxiliary_loss_mlp": 0.01518973, + "balance_loss_clip": 1.27009153, + "balance_loss_mlp": 1.16248941, + "epoch": 0.03192544716669172, + "flos": 21217186445760.0, + "grad_norm": 2.5851936742133232, + "language_loss": 0.76237917, + "learning_rate": 3.999961169942907e-06, + "loss": 0.79395723, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.7584757804870605 + }, + { + "auxiliary_loss_clip": 0.01633129, + "auxiliary_loss_mlp": 0.0150197, + "balance_loss_clip": 1.26331127, + "balance_loss_mlp": 1.13995552, + "epoch": 0.03198557041935969, + "flos": 24355545448320.0, + "grad_norm": 2.87043136880743, + "language_loss": 0.90898263, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9403336, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 2.913958787918091 + }, + { + "auxiliary_loss_clip": 0.01777413, + "auxiliary_loss_mlp": 0.01669281, + "balance_loss_clip": 1.41087699, + "balance_loss_mlp": 1.23898315, + "epoch": 0.032045693672027656, + "flos": 61834023694080.0, + "grad_norm": 0.8418329889307692, + "language_loss": 0.5790053, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.61347222, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.3136091232299805 + }, + { + "auxiliary_loss_clip": 0.01644255, + "auxiliary_loss_mlp": 0.01522367, + "balance_loss_clip": 1.27459323, + "balance_loss_mlp": 1.16702783, + "epoch": 0.03210581692469563, + "flos": 28403620476480.0, + "grad_norm": 2.163746248197227, + "language_loss": 0.86740237, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89906859, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.990827798843384 + }, + { + "auxiliary_loss_clip": 0.0163316, + "auxiliary_loss_mlp": 0.01580864, + "balance_loss_clip": 1.26238191, + "balance_loss_mlp": 1.23639655, + "epoch": 0.03216594017736359, + "flos": 24720038709120.0, + "grad_norm": 2.242759148122535, + "language_loss": 0.77776235, + "learning_rate": 3.999950855751232e-06, + "loss": 0.80990261, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 2.7515110969543457 + }, + { + "auxiliary_loss_clip": 0.01640338, + "auxiliary_loss_mlp": 0.01521067, + "balance_loss_clip": 1.27070081, + "balance_loss_mlp": 1.15638161, + "epoch": 0.032226063430031565, + "flos": 31178130996960.0, + "grad_norm": 2.480095831741286, + "language_loss": 0.80518007, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83679414, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.847532033920288 + }, + { + "auxiliary_loss_clip": 0.01637632, + "auxiliary_loss_mlp": 0.01531154, + "balance_loss_clip": 1.26782942, + "balance_loss_mlp": 1.17505169, + "epoch": 0.03228618668269954, + "flos": 32201025744960.0, + "grad_norm": 2.106930099147689, + "language_loss": 0.69963241, + "learning_rate": 3.999945243624975e-06, + "loss": 0.73132026, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 2.814126491546631 + }, + { + "auxiliary_loss_clip": 0.01634331, + "auxiliary_loss_mlp": 0.01550027, + "balance_loss_clip": 1.26447833, + "balance_loss_mlp": 1.20346141, + "epoch": 0.0323463099353675, + "flos": 22672125236160.0, + "grad_norm": 3.2928897782959274, + "language_loss": 0.82916796, + "learning_rate": 3.999942323804607e-06, + "loss": 0.86101162, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 2.752476453781128 + }, + { + "auxiliary_loss_clip": 0.01646316, + "auxiliary_loss_mlp": 0.01550206, + "balance_loss_clip": 1.27548552, + "balance_loss_mlp": 1.1960113, + "epoch": 0.032406433188035474, + "flos": 26907529268160.0, + "grad_norm": 2.846798604324742, + "language_loss": 0.79387832, + "learning_rate": 3.999939328146225e-06, + "loss": 0.82584357, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 2.838451385498047 + }, + { + "auxiliary_loss_clip": 0.01635495, + "auxiliary_loss_mlp": 0.01528319, + "balance_loss_clip": 1.26519012, + "balance_loss_mlp": 1.16859269, + "epoch": 0.03246655644070344, + "flos": 31506630069600.0, + "grad_norm": 3.162663837601607, + "language_loss": 0.77813411, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80977219, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 2.8113324642181396 + }, + { + "auxiliary_loss_clip": 0.01637439, + "auxiliary_loss_mlp": 0.01531958, + "balance_loss_clip": 1.26656556, + "balance_loss_mlp": 1.17452049, + "epoch": 0.03252667969337141, + "flos": 23220382253760.0, + "grad_norm": 2.014642133398968, + "language_loss": 0.85520202, + "learning_rate": 3.999933109315878e-06, + "loss": 0.88689595, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.7746596336364746 + }, + { + "auxiliary_loss_clip": 0.01628122, + "auxiliary_loss_mlp": 0.0151508, + "balance_loss_clip": 1.25662684, + "balance_loss_mlp": 1.15440059, + "epoch": 0.032586802946039384, + "flos": 14759245870560.0, + "grad_norm": 2.5785479857982536, + "language_loss": 0.89124048, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.92267251, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.8107082843780518 + }, + { + "auxiliary_loss_clip": 0.01634562, + "auxiliary_loss_mlp": 0.01508578, + "balance_loss_clip": 1.26387548, + "balance_loss_mlp": 1.14312959, + "epoch": 0.03264692619870735, + "flos": 24283026077760.0, + "grad_norm": 2.225280027946472, + "language_loss": 0.71182203, + "learning_rate": 3.999926587134879e-06, + "loss": 0.74325341, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 3.053314685821533 + }, + { + "auxiliary_loss_clip": 0.01624604, + "auxiliary_loss_mlp": 0.0150286, + "balance_loss_clip": 1.25365317, + "balance_loss_mlp": 1.13035512, + "epoch": 0.03270704945137532, + "flos": 22895372571840.0, + "grad_norm": 2.643475383486378, + "language_loss": 0.91867793, + "learning_rate": 3.999923212288192e-06, + "loss": 0.9499526, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.818608045578003 + }, + { + "auxiliary_loss_clip": 0.01647051, + "auxiliary_loss_mlp": 0.01494648, + "balance_loss_clip": 1.27593803, + "balance_loss_mlp": 1.1189009, + "epoch": 0.032767172704043286, + "flos": 18042984967680.0, + "grad_norm": 3.464765049488329, + "language_loss": 0.66032219, + "learning_rate": 3.999919761604216e-06, + "loss": 0.6917392, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 2.7455952167510986 + }, + { + "auxiliary_loss_clip": 0.01625273, + "auxiliary_loss_mlp": 0.01504132, + "balance_loss_clip": 1.25509202, + "balance_loss_mlp": 1.12475991, + "epoch": 0.03282729595671126, + "flos": 22530955167360.0, + "grad_norm": 2.2658333195520544, + "language_loss": 0.91963029, + "learning_rate": 3.999916235083083e-06, + "loss": 0.9509244, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.7691776752471924 + }, + { + "auxiliary_loss_clip": 0.01623262, + "auxiliary_loss_mlp": 0.01496461, + "balance_loss_clip": 1.25063384, + "balance_loss_mlp": 1.11823416, + "epoch": 0.03288741920937923, + "flos": 20412608372640.0, + "grad_norm": 2.7298042056934104, + "language_loss": 0.81994605, + "learning_rate": 3.999912632724925e-06, + "loss": 0.85114324, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.7760019302368164 + }, + { + "auxiliary_loss_clip": 0.01630417, + "auxiliary_loss_mlp": 0.01475569, + "balance_loss_clip": 1.25799298, + "balance_loss_mlp": 1.09295511, + "epoch": 0.032947542462047195, + "flos": 20780135886240.0, + "grad_norm": 2.076668883354691, + "language_loss": 0.81687516, + "learning_rate": 3.999908954529881e-06, + "loss": 0.84793508, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 2.7889819145202637 + }, + { + "auxiliary_loss_clip": 0.01627731, + "auxiliary_loss_mlp": 0.01489663, + "balance_loss_clip": 1.25605249, + "balance_loss_mlp": 1.11029172, + "epoch": 0.03300766571471517, + "flos": 19903303939680.0, + "grad_norm": 4.661524263333097, + "language_loss": 0.67227691, + "learning_rate": 3.999905200498087e-06, + "loss": 0.70345086, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 5.749229431152344 + }, + { + "auxiliary_loss_clip": 0.01640836, + "auxiliary_loss_mlp": 0.01485849, + "balance_loss_clip": 1.27061296, + "balance_loss_mlp": 1.10609555, + "epoch": 0.03306778896738313, + "flos": 17969707033920.0, + "grad_norm": 4.580163407338368, + "language_loss": 0.86366832, + "learning_rate": 3.999901370629689e-06, + "loss": 0.89493513, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 2.8484816551208496 + }, + { + "auxiliary_loss_clip": 0.01649668, + "auxiliary_loss_mlp": 0.0150263, + "balance_loss_clip": 1.28007364, + "balance_loss_mlp": 1.11753654, + "epoch": 0.033127912220051105, + "flos": 21655564490880.0, + "grad_norm": 1.8316315459733388, + "language_loss": 0.81558806, + "learning_rate": 3.99989746492483e-06, + "loss": 0.84711111, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 2.8075459003448486 + }, + { + "auxiliary_loss_clip": 0.01634915, + "auxiliary_loss_mlp": 0.01506598, + "balance_loss_clip": 1.26314437, + "balance_loss_mlp": 1.12302995, + "epoch": 0.03318803547271908, + "flos": 30190889083680.0, + "grad_norm": 3.9926276723520098, + "language_loss": 0.86506915, + "learning_rate": 3.999893483383658e-06, + "loss": 0.89648426, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 2.8900082111358643 + }, + { + "auxiliary_loss_clip": 0.01634391, + "auxiliary_loss_mlp": 0.01482966, + "balance_loss_clip": 1.26254296, + "balance_loss_mlp": 1.09653699, + "epoch": 0.03324815872538704, + "flos": 20378131310880.0, + "grad_norm": 2.768780650491027, + "language_loss": 0.9306109, + "learning_rate": 3.999889426006326e-06, + "loss": 0.96178436, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 2.8103418350219727 + }, + { + "auxiliary_loss_clip": 0.01626668, + "auxiliary_loss_mlp": 0.01493159, + "balance_loss_clip": 1.2558831, + "balance_loss_mlp": 1.11207128, + "epoch": 0.033308281978055014, + "flos": 24496639660800.0, + "grad_norm": 14.410457617259048, + "language_loss": 0.79095083, + "learning_rate": 3.999885292792986e-06, + "loss": 0.82214916, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 2.9866654872894287 + }, + { + "auxiliary_loss_clip": 0.01631362, + "auxiliary_loss_mlp": 0.01484911, + "balance_loss_clip": 1.25899863, + "balance_loss_mlp": 1.10458517, + "epoch": 0.03336840523072298, + "flos": 23402363387040.0, + "grad_norm": 2.279925876200172, + "language_loss": 0.81961751, + "learning_rate": 3.999881083743795e-06, + "loss": 0.85078025, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.8698031902313232 + }, + { + "auxiliary_loss_clip": 0.0161817, + "auxiliary_loss_mlp": 0.01500107, + "balance_loss_clip": 1.24653351, + "balance_loss_mlp": 1.11711156, + "epoch": 0.03342852848339095, + "flos": 30552954942240.0, + "grad_norm": 2.9015284252795346, + "language_loss": 0.88954574, + "learning_rate": 3.999876798858914e-06, + "loss": 0.9207285, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 2.8899788856506348 + }, + { + "auxiliary_loss_clip": 0.01616255, + "auxiliary_loss_mlp": 0.0150014, + "balance_loss_clip": 1.24533212, + "balance_loss_mlp": 1.12629962, + "epoch": 0.03348865173605892, + "flos": 22895524284480.0, + "grad_norm": 2.5776976567624215, + "language_loss": 0.83871251, + "learning_rate": 3.999872438138503e-06, + "loss": 0.86987644, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.8373022079467773 + }, + { + "auxiliary_loss_clip": 0.01630143, + "auxiliary_loss_mlp": 0.01506297, + "balance_loss_clip": 1.2580229, + "balance_loss_mlp": 1.1294055, + "epoch": 0.03354877498872689, + "flos": 17677960712640.0, + "grad_norm": 3.7922526806819223, + "language_loss": 0.94504625, + "learning_rate": 3.999868001582729e-06, + "loss": 0.97641063, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.7382781505584717 + }, + { + "auxiliary_loss_clip": 0.01633287, + "auxiliary_loss_mlp": 0.01502636, + "balance_loss_clip": 1.26166821, + "balance_loss_mlp": 1.11372733, + "epoch": 0.03360889824139486, + "flos": 21655147281120.0, + "grad_norm": 2.8024416436412785, + "language_loss": 0.77512217, + "learning_rate": 3.99986348919176e-06, + "loss": 0.80648136, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 2.8311898708343506 + }, + { + "auxiliary_loss_clip": 0.01617536, + "auxiliary_loss_mlp": 0.01521552, + "balance_loss_clip": 1.24593806, + "balance_loss_mlp": 1.14256227, + "epoch": 0.033669021494062826, + "flos": 21797720691840.0, + "grad_norm": 2.6660851313447225, + "language_loss": 0.87477136, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90616226, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.784266471862793 + }, + { + "auxiliary_loss_clip": 0.01629262, + "auxiliary_loss_mlp": 0.01500752, + "balance_loss_clip": 1.25729752, + "balance_loss_mlp": 1.10745668, + "epoch": 0.0337291447467308, + "flos": 21868078157280.0, + "grad_norm": 2.5940653806312057, + "language_loss": 0.81423646, + "learning_rate": 3.999854236904925e-06, + "loss": 0.84553659, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.717548131942749 + }, + { + "auxiliary_loss_clip": 0.01612952, + "auxiliary_loss_mlp": 0.01502751, + "balance_loss_clip": 1.24110413, + "balance_loss_mlp": 1.11880171, + "epoch": 0.03378926799939877, + "flos": 24248397303360.0, + "grad_norm": 2.0896225355549323, + "language_loss": 0.82563561, + "learning_rate": 3.999849497009409e-06, + "loss": 0.85679263, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.8133466243743896 + }, + { + "auxiliary_loss_clip": 0.01621944, + "auxiliary_loss_mlp": 0.01490949, + "balance_loss_clip": 1.25014198, + "balance_loss_mlp": 1.11100483, + "epoch": 0.033849391252066735, + "flos": 16509647941920.0, + "grad_norm": 2.2927128093150375, + "language_loss": 0.8468529, + "learning_rate": 3.999844681279401e-06, + "loss": 0.87798184, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.782304048538208 + }, + { + "auxiliary_loss_clip": 0.01615729, + "auxiliary_loss_mlp": 0.01493156, + "balance_loss_clip": 1.24264836, + "balance_loss_mlp": 1.11435699, + "epoch": 0.03390951450473471, + "flos": 15671123801280.0, + "grad_norm": 2.1337913987233286, + "language_loss": 0.94509774, + "learning_rate": 3.99983978971508e-06, + "loss": 0.97618657, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.7134156227111816 + }, + { + "auxiliary_loss_clip": 0.01618703, + "auxiliary_loss_mlp": 0.01493128, + "balance_loss_clip": 1.24603534, + "balance_loss_mlp": 1.1166172, + "epoch": 0.03396963775740267, + "flos": 22677017968800.0, + "grad_norm": 2.5266768778316937, + "language_loss": 0.94695783, + "learning_rate": 3.999834822316635e-06, + "loss": 0.9780761, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.85237193107605 + }, + { + "auxiliary_loss_clip": 0.01751727, + "auxiliary_loss_mlp": 0.01555908, + "balance_loss_clip": 1.38182878, + "balance_loss_mlp": 1.12866211, + "epoch": 0.034029761010070644, + "flos": 64400116789440.0, + "grad_norm": 1.1120606194315603, + "language_loss": 0.54827887, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.58135521, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 3.338473081588745 + }, + { + "auxiliary_loss_clip": 0.0162013, + "auxiliary_loss_mlp": 0.01499816, + "balance_loss_clip": 1.2488184, + "balance_loss_mlp": 1.13246083, + "epoch": 0.034089884262738616, + "flos": 25006247519040.0, + "grad_norm": 2.60265007910168, + "language_loss": 0.7725566, + "learning_rate": 3.999824660018126e-06, + "loss": 0.80375606, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.8349270820617676 + }, + { + "auxiliary_loss_clip": 0.01628601, + "auxiliary_loss_mlp": 0.01511862, + "balance_loss_clip": 1.25518489, + "balance_loss_mlp": 1.13420737, + "epoch": 0.03415000751540658, + "flos": 28441624857120.0, + "grad_norm": 4.499117870181107, + "language_loss": 0.80698329, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83838791, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 2.816627025604248 + }, + { + "auxiliary_loss_clip": 0.01621197, + "auxiliary_loss_mlp": 0.01534061, + "balance_loss_clip": 1.24831414, + "balance_loss_mlp": 1.18272734, + "epoch": 0.034210130768074554, + "flos": 21470776673760.0, + "grad_norm": 2.3550446449976215, + "language_loss": 0.86400437, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89555699, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 2.835143566131592 + }, + { + "auxiliary_loss_clip": 0.01600671, + "auxiliary_loss_mlp": 0.01519326, + "balance_loss_clip": 1.22708488, + "balance_loss_mlp": 1.15731084, + "epoch": 0.03427025402074252, + "flos": 18699490046880.0, + "grad_norm": 2.192883018104188, + "language_loss": 0.95857036, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98977029, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.8538599014282227 + }, + { + "auxiliary_loss_clip": 0.01608941, + "auxiliary_loss_mlp": 0.01498157, + "balance_loss_clip": 1.2341187, + "balance_loss_mlp": 1.13080192, + "epoch": 0.03433037727341049, + "flos": 20852086334400.0, + "grad_norm": 2.746110750441103, + "language_loss": 0.80362093, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.83469188, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 2.7244842052459717 + }, + { + "auxiliary_loss_clip": 0.01616573, + "auxiliary_loss_mlp": 0.0149266, + "balance_loss_clip": 1.24279284, + "balance_loss_mlp": 1.12320638, + "epoch": 0.03439050052607846, + "flos": 25412613832800.0, + "grad_norm": 3.0236787468008655, + "language_loss": 0.80645931, + "learning_rate": 3.999797927188199e-06, + "loss": 0.83755171, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.8477375507354736 + }, + { + "auxiliary_loss_clip": 0.01626622, + "auxiliary_loss_mlp": 0.01518481, + "balance_loss_clip": 1.25191319, + "balance_loss_mlp": 1.15589452, + "epoch": 0.03445062377874643, + "flos": 17642118237120.0, + "grad_norm": 2.578029256884044, + "language_loss": 0.84930515, + "learning_rate": 3.999792353123774e-06, + "loss": 0.88075626, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.7138497829437256 + }, + { + "auxiliary_loss_clip": 0.01606811, + "auxiliary_loss_mlp": 0.01479579, + "balance_loss_clip": 1.23105586, + "balance_loss_mlp": 1.10955358, + "epoch": 0.0345107470314144, + "flos": 16766424135360.0, + "grad_norm": 3.0808572972989787, + "language_loss": 0.7676816, + "learning_rate": 3.999786703227023e-06, + "loss": 0.79854554, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.848012924194336 + }, + { + "auxiliary_loss_clip": 0.0160971, + "auxiliary_loss_mlp": 0.01487972, + "balance_loss_clip": 1.23528314, + "balance_loss_mlp": 1.12176096, + "epoch": 0.03457087028408237, + "flos": 14686233433920.0, + "grad_norm": 6.565983327479279, + "language_loss": 0.84336209, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.87433887, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.7539093494415283 + }, + { + "auxiliary_loss_clip": 0.01615309, + "auxiliary_loss_mlp": 0.01490538, + "balance_loss_clip": 1.24209428, + "balance_loss_mlp": 1.12184787, + "epoch": 0.03463099353675034, + "flos": 20013372552960.0, + "grad_norm": 2.4214332790941224, + "language_loss": 0.84077358, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.87183207, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 2.7996575832366943 + }, + { + "auxiliary_loss_clip": 0.01617002, + "auxiliary_loss_mlp": 0.01527235, + "balance_loss_clip": 1.24341273, + "balance_loss_mlp": 1.16674638, + "epoch": 0.03469111678941831, + "flos": 25303493423520.0, + "grad_norm": 2.6787216933579177, + "language_loss": 0.86527896, + "learning_rate": 3.99976929854497e-06, + "loss": 0.89672136, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 2.770962953567505 + }, + { + "auxiliary_loss_clip": 0.01606968, + "auxiliary_loss_mlp": 0.01479025, + "balance_loss_clip": 1.23344553, + "balance_loss_mlp": 1.09869957, + "epoch": 0.034751240042086275, + "flos": 23261724312480.0, + "grad_norm": 2.28004731103832, + "language_loss": 0.72157472, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.75243461, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 2.8127026557922363 + }, + { + "auxiliary_loss_clip": 0.01607987, + "auxiliary_loss_mlp": 0.01490021, + "balance_loss_clip": 1.23560846, + "balance_loss_mlp": 1.11751628, + "epoch": 0.03481136329475425, + "flos": 23771521811520.0, + "grad_norm": 2.058432536262398, + "language_loss": 0.77825636, + "learning_rate": 3.999757316265973e-06, + "loss": 0.80923641, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.8821263313293457 + }, + { + "auxiliary_loss_clip": 0.01607529, + "auxiliary_loss_mlp": 0.01510368, + "balance_loss_clip": 1.23327136, + "balance_loss_mlp": 1.13442945, + "epoch": 0.03487148654742222, + "flos": 20159624995200.0, + "grad_norm": 2.2473277381211307, + "language_loss": 0.8649444, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89612335, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 2.717898368835449 + }, + { + "auxiliary_loss_clip": 0.01601386, + "auxiliary_loss_mlp": 0.01494445, + "balance_loss_clip": 1.22807884, + "balance_loss_mlp": 1.11526453, + "epoch": 0.034931609800090184, + "flos": 15671427226560.0, + "grad_norm": 4.556237533928295, + "language_loss": 0.82316744, + "learning_rate": 3.999745030662987e-06, + "loss": 0.85412574, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.776779890060425 + }, + { + "auxiliary_loss_clip": 0.01606505, + "auxiliary_loss_mlp": 0.01496827, + "balance_loss_clip": 1.23279929, + "balance_loss_mlp": 1.11898136, + "epoch": 0.034991733052758156, + "flos": 16364229919200.0, + "grad_norm": 2.805807038812477, + "language_loss": 0.77373493, + "learning_rate": 3.99973877411558e-06, + "loss": 0.8047682, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.7484893798828125 + }, + { + "auxiliary_loss_clip": 0.01609425, + "auxiliary_loss_mlp": 0.01509077, + "balance_loss_clip": 1.23676467, + "balance_loss_mlp": 1.13466477, + "epoch": 0.03505185630542612, + "flos": 19389144702240.0, + "grad_norm": 3.0094670878615783, + "language_loss": 0.87909448, + "learning_rate": 3.999732441737877e-06, + "loss": 0.91027957, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.8058583736419678 + }, + { + "auxiliary_loss_clip": 0.01614297, + "auxiliary_loss_mlp": 0.01493617, + "balance_loss_clip": 1.24214888, + "balance_loss_mlp": 1.11710691, + "epoch": 0.03511197955809409, + "flos": 21325927573440.0, + "grad_norm": 2.562309535996968, + "language_loss": 0.80902946, + "learning_rate": 3.99972603353012e-06, + "loss": 0.84010857, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.854360342025757 + }, + { + "auxiliary_loss_clip": 0.01608104, + "auxiliary_loss_mlp": 0.01495435, + "balance_loss_clip": 1.23647475, + "balance_loss_mlp": 1.11797071, + "epoch": 0.035172102810762065, + "flos": 14138317769760.0, + "grad_norm": 5.7331053346072505, + "language_loss": 0.92841649, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95945191, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.7976763248443604 + }, + { + "auxiliary_loss_clip": 0.01614191, + "auxiliary_loss_mlp": 0.01513246, + "balance_loss_clip": 1.24260402, + "balance_loss_mlp": 1.13711703, + "epoch": 0.03523222606343003, + "flos": 20298633158880.0, + "grad_norm": 7.637725184746024, + "language_loss": 0.87524235, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.90651673, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 5.789145469665527 + }, + { + "auxiliary_loss_clip": 0.01604412, + "auxiliary_loss_mlp": 0.01519565, + "balance_loss_clip": 1.23301637, + "balance_loss_mlp": 1.13924003, + "epoch": 0.035292349316098, + "flos": 20377903741920.0, + "grad_norm": 2.117372553633385, + "language_loss": 0.76604784, + "learning_rate": 3.999706353928965e-06, + "loss": 0.79728758, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 4.149916648864746 + }, + { + "auxiliary_loss_clip": 0.01602455, + "auxiliary_loss_mlp": 0.01496979, + "balance_loss_clip": 1.23151469, + "balance_loss_mlp": 1.11627209, + "epoch": 0.03535247256876597, + "flos": 21470928386400.0, + "grad_norm": 1.9329808126337245, + "language_loss": 0.7913571, + "learning_rate": 3.999699642403449e-06, + "loss": 0.82235146, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 4.287456512451172 + }, + { + "auxiliary_loss_clip": 0.01594936, + "auxiliary_loss_mlp": 0.01481084, + "balance_loss_clip": 1.22347665, + "balance_loss_mlp": 1.10094953, + "epoch": 0.03541259582143394, + "flos": 23625307297440.0, + "grad_norm": 2.945052644966703, + "language_loss": 0.9402082, + "learning_rate": 3.99969285504912e-06, + "loss": 0.97096837, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 2.744765281677246 + }, + { + "auxiliary_loss_clip": 0.01600504, + "auxiliary_loss_mlp": 0.01496648, + "balance_loss_clip": 1.23022401, + "balance_loss_mlp": 1.11136389, + "epoch": 0.03547271907410191, + "flos": 33729014900160.0, + "grad_norm": 2.2770334289597436, + "language_loss": 0.83999777, + "learning_rate": 3.99968599186624e-06, + "loss": 0.8709693, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 2.8535072803497314 + }, + { + "auxiliary_loss_clip": 0.01600628, + "auxiliary_loss_mlp": 0.01492622, + "balance_loss_clip": 1.2304399, + "balance_loss_mlp": 1.11439478, + "epoch": 0.03553284232676988, + "flos": 21144856716000.0, + "grad_norm": 2.327503806754012, + "language_loss": 0.8681792, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89911169, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 2.715792417526245 + }, + { + "auxiliary_loss_clip": 0.01598875, + "auxiliary_loss_mlp": 0.01484629, + "balance_loss_clip": 1.2289592, + "balance_loss_mlp": 1.09934473, + "epoch": 0.03559296557943785, + "flos": 20048608177920.0, + "grad_norm": 2.2526063053875727, + "language_loss": 0.83146203, + "learning_rate": 3.999672038015861e-06, + "loss": 0.862297, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 2.738260507583618 + }, + { + "auxiliary_loss_clip": 0.01673721, + "auxiliary_loss_mlp": 0.01445358, + "balance_loss_clip": 1.3062526, + "balance_loss_mlp": 1.1073761, + "epoch": 0.035653088832105814, + "flos": 60341346020160.0, + "grad_norm": 0.8826564263252799, + "language_loss": 0.59752762, + "learning_rate": 3.999664947348893e-06, + "loss": 0.62871838, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 3.3537657260894775 + }, + { + "auxiliary_loss_clip": 0.01604546, + "auxiliary_loss_mlp": 0.01478389, + "balance_loss_clip": 1.23593664, + "balance_loss_mlp": 1.09443927, + "epoch": 0.035713212084773786, + "flos": 20114793545760.0, + "grad_norm": 2.3807281550743324, + "language_loss": 0.87023324, + "learning_rate": 3.999657780854429e-06, + "loss": 0.90106255, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.770905017852783 + }, + { + "auxiliary_loss_clip": 0.01595652, + "auxiliary_loss_mlp": 0.01494315, + "balance_loss_clip": 1.22424197, + "balance_loss_mlp": 1.11933017, + "epoch": 0.03577333533744176, + "flos": 26288156221920.0, + "grad_norm": 2.332757048744959, + "language_loss": 0.83289766, + "learning_rate": 3.999650538532742e-06, + "loss": 0.86379731, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.8323378562927246 + }, + { + "auxiliary_loss_clip": 0.01613068, + "auxiliary_loss_mlp": 0.01493809, + "balance_loss_clip": 1.24383354, + "balance_loss_mlp": 1.11748886, + "epoch": 0.035833458590109724, + "flos": 10891293495840.0, + "grad_norm": 3.254108692224103, + "language_loss": 0.96261388, + "learning_rate": 3.999643220384106e-06, + "loss": 0.99368262, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.797865152359009 + }, + { + "auxiliary_loss_clip": 0.01608002, + "auxiliary_loss_mlp": 0.01490461, + "balance_loss_clip": 1.23865414, + "balance_loss_mlp": 1.10708392, + "epoch": 0.035893581842777696, + "flos": 22092349553280.0, + "grad_norm": 2.334428644803202, + "language_loss": 0.8301602, + "learning_rate": 3.999635826408799e-06, + "loss": 0.86114478, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.883066415786743 + }, + { + "auxiliary_loss_clip": 0.01612119, + "auxiliary_loss_mlp": 0.01500552, + "balance_loss_clip": 1.24295485, + "balance_loss_mlp": 1.12003636, + "epoch": 0.03595370509544566, + "flos": 23040525097440.0, + "grad_norm": 2.688237016301797, + "language_loss": 0.81305116, + "learning_rate": 3.999628356607101e-06, + "loss": 0.84417784, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.832404851913452 + }, + { + "auxiliary_loss_clip": 0.01606332, + "auxiliary_loss_mlp": 0.0149951, + "balance_loss_clip": 1.23667049, + "balance_loss_mlp": 1.12261808, + "epoch": 0.03601382834811363, + "flos": 20779870389120.0, + "grad_norm": 1.8767635833198206, + "language_loss": 0.81100553, + "learning_rate": 3.999620810979295e-06, + "loss": 0.84206396, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.778428077697754 + }, + { + "auxiliary_loss_clip": 0.0159726, + "auxiliary_loss_mlp": 0.0148081, + "balance_loss_clip": 1.22659206, + "balance_loss_mlp": 1.10487199, + "epoch": 0.036073951600781605, + "flos": 23953920154560.0, + "grad_norm": 2.6450375665832677, + "language_loss": 0.86044407, + "learning_rate": 3.999613189525668e-06, + "loss": 0.89122486, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.794165849685669 + }, + { + "auxiliary_loss_clip": 0.01594667, + "auxiliary_loss_mlp": 0.01468673, + "balance_loss_clip": 1.22428131, + "balance_loss_mlp": 1.08434224, + "epoch": 0.03613407485344957, + "flos": 18914051833920.0, + "grad_norm": 3.524714986052813, + "language_loss": 0.82412422, + "learning_rate": 3.999605492246508e-06, + "loss": 0.85475755, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.714480400085449 + }, + { + "auxiliary_loss_clip": 0.01601958, + "auxiliary_loss_mlp": 0.01488171, + "balance_loss_clip": 1.23191833, + "balance_loss_mlp": 1.11413956, + "epoch": 0.03619419810611754, + "flos": 23040714738240.0, + "grad_norm": 3.107810544170797, + "language_loss": 0.75634348, + "learning_rate": 3.999597719142107e-06, + "loss": 0.7872448, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.7957725524902344 + }, + { + "auxiliary_loss_clip": 0.01597177, + "auxiliary_loss_mlp": 0.01486066, + "balance_loss_clip": 1.22591197, + "balance_loss_mlp": 1.11794782, + "epoch": 0.03625432135878551, + "flos": 29460195794880.0, + "grad_norm": 2.425981403396645, + "language_loss": 0.80016279, + "learning_rate": 3.999589870212761e-06, + "loss": 0.8309952, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.8153910636901855 + }, + { + "auxiliary_loss_clip": 0.0161198, + "auxiliary_loss_mlp": 0.01502319, + "balance_loss_clip": 1.24297523, + "balance_loss_mlp": 1.13477325, + "epoch": 0.03631444461145348, + "flos": 23510497664160.0, + "grad_norm": 2.6647263813432818, + "language_loss": 0.86880851, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89995146, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 2.7773170471191406 + }, + { + "auxiliary_loss_clip": 0.0160044, + "auxiliary_loss_mlp": 0.01497506, + "balance_loss_clip": 1.23023057, + "balance_loss_mlp": 1.12137651, + "epoch": 0.03637456786412145, + "flos": 16620019980480.0, + "grad_norm": 3.611221405731457, + "language_loss": 0.8104068, + "learning_rate": 3.999573944880424e-06, + "loss": 0.84138626, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 2.749738931655884 + }, + { + "auxiliary_loss_clip": 0.01602846, + "auxiliary_loss_mlp": 0.01512573, + "balance_loss_clip": 1.23242831, + "balance_loss_mlp": 1.13739765, + "epoch": 0.03643469111678942, + "flos": 15853673856960.0, + "grad_norm": 17.984071108591476, + "language_loss": 0.85428691, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.88544118, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 2.780590534210205 + }, + { + "auxiliary_loss_clip": 0.01605028, + "auxiliary_loss_mlp": 0.01492533, + "balance_loss_clip": 1.23604393, + "balance_loss_mlp": 1.11621332, + "epoch": 0.03649481436945739, + "flos": 23622500613600.0, + "grad_norm": 2.133337222025446, + "language_loss": 0.82372975, + "learning_rate": 3.999557716251912e-06, + "loss": 0.85470539, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 2.7742984294891357 + }, + { + "auxiliary_loss_clip": 0.01609822, + "auxiliary_loss_mlp": 0.01499416, + "balance_loss_clip": 1.24030471, + "balance_loss_mlp": 1.12939072, + "epoch": 0.036554937622125354, + "flos": 21757364765280.0, + "grad_norm": 2.4388966543881443, + "language_loss": 0.83370847, + "learning_rate": 3.999549488202358e-06, + "loss": 0.86480081, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 2.732762098312378 + }, + { + "auxiliary_loss_clip": 0.01610176, + "auxiliary_loss_mlp": 0.01502318, + "balance_loss_clip": 1.2405889, + "balance_loss_mlp": 1.12924027, + "epoch": 0.036615060874793326, + "flos": 17821596111840.0, + "grad_norm": 3.1251027218988887, + "language_loss": 0.82053787, + "learning_rate": 3.999541184329688e-06, + "loss": 0.85166276, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.7308132648468018 + }, + { + "auxiliary_loss_clip": 0.01608068, + "auxiliary_loss_mlp": 0.01496439, + "balance_loss_clip": 1.23671114, + "balance_loss_mlp": 1.11554122, + "epoch": 0.0366751841274613, + "flos": 26755853099040.0, + "grad_norm": 2.2359023462227827, + "language_loss": 0.79733735, + "learning_rate": 3.999532804634215e-06, + "loss": 0.82838237, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.779658555984497 + }, + { + "auxiliary_loss_clip": 0.01610861, + "auxiliary_loss_mlp": 0.01490661, + "balance_loss_clip": 1.24156272, + "balance_loss_mlp": 1.11815608, + "epoch": 0.03673530738012926, + "flos": 22198966704000.0, + "grad_norm": 2.7528349115760835, + "language_loss": 0.874461, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.90547621, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.8000359535217285 + }, + { + "auxiliary_loss_clip": 0.01611027, + "auxiliary_loss_mlp": 0.01503235, + "balance_loss_clip": 1.24116087, + "balance_loss_mlp": 1.11756945, + "epoch": 0.036795430632797235, + "flos": 24684006592800.0, + "grad_norm": 2.294951338618536, + "language_loss": 0.7258935, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75703609, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 2.8005576133728027 + }, + { + "auxiliary_loss_clip": 0.01607624, + "auxiliary_loss_mlp": 0.01492303, + "balance_loss_clip": 1.23895168, + "balance_loss_mlp": 1.11941624, + "epoch": 0.0368555538854652, + "flos": 17750859364800.0, + "grad_norm": 7.240458279496874, + "language_loss": 0.79146087, + "learning_rate": 3.999507210614175e-06, + "loss": 0.82246017, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.828798770904541 + }, + { + "auxiliary_loss_clip": 0.01610452, + "auxiliary_loss_mlp": 0.01479904, + "balance_loss_clip": 1.24229789, + "balance_loss_mlp": 1.09786224, + "epoch": 0.03691567713813317, + "flos": 20596637626560.0, + "grad_norm": 2.4185127742406887, + "language_loss": 0.93759966, + "learning_rate": 3.9994985276307e-06, + "loss": 0.96850318, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.7779579162597656 + }, + { + "auxiliary_loss_clip": 0.016189, + "auxiliary_loss_mlp": 0.01495741, + "balance_loss_clip": 1.25065851, + "balance_loss_mlp": 1.11713266, + "epoch": 0.036975800390801145, + "flos": 33652854426240.0, + "grad_norm": 5.04780460711102, + "language_loss": 0.73325706, + "learning_rate": 3.999489768826041e-06, + "loss": 0.76440346, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.855332374572754 + }, + { + "auxiliary_loss_clip": 0.01596729, + "auxiliary_loss_mlp": 0.01493919, + "balance_loss_clip": 1.22734928, + "balance_loss_mlp": 1.10787129, + "epoch": 0.03703592364346911, + "flos": 28296244762560.0, + "grad_norm": 2.460530868874466, + "language_loss": 0.82020795, + "learning_rate": 3.999480934200528e-06, + "loss": 0.85111439, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 2.8113930225372314 + }, + { + "auxiliary_loss_clip": 0.01600912, + "auxiliary_loss_mlp": 0.0150499, + "balance_loss_clip": 1.23224592, + "balance_loss_mlp": 1.13267589, + "epoch": 0.03709604689613708, + "flos": 31506933494880.0, + "grad_norm": 3.8141339925034687, + "language_loss": 0.68391776, + "learning_rate": 3.999472023754499e-06, + "loss": 0.71497679, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 2.82322359085083 + }, + { + "auxiliary_loss_clip": 0.01610225, + "auxiliary_loss_mlp": 0.01492429, + "balance_loss_clip": 1.24298251, + "balance_loss_mlp": 1.12335682, + "epoch": 0.03715617014880505, + "flos": 19611405905760.0, + "grad_norm": 2.5087212928990272, + "language_loss": 0.80532742, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8363539, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 2.787692070007324 + }, + { + "auxiliary_loss_clip": 0.01603622, + "auxiliary_loss_mlp": 0.01489894, + "balance_loss_clip": 1.23521006, + "balance_loss_mlp": 1.12158489, + "epoch": 0.03721629340147302, + "flos": 15926003586720.0, + "grad_norm": 2.11197565722332, + "language_loss": 0.91236198, + "learning_rate": 3.999453975402242e-06, + "loss": 0.94329715, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.7816169261932373 + }, + { + "auxiliary_loss_clip": 0.01604748, + "auxiliary_loss_mlp": 0.01494351, + "balance_loss_clip": 1.23581016, + "balance_loss_mlp": 1.11707735, + "epoch": 0.03727641665414099, + "flos": 21106131700320.0, + "grad_norm": 2.637032905852305, + "language_loss": 0.94318748, + "learning_rate": 3.9994448374967e-06, + "loss": 0.97417843, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.7607030868530273 + }, + { + "auxiliary_loss_clip": 0.01595519, + "auxiliary_loss_mlp": 0.01482437, + "balance_loss_clip": 1.22637963, + "balance_loss_mlp": 1.09772491, + "epoch": 0.037336539906808956, + "flos": 24133853167200.0, + "grad_norm": 2.996202279958255, + "language_loss": 0.77598834, + "learning_rate": 3.999435623772008e-06, + "loss": 0.80676794, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 2.7556283473968506 + }, + { + "auxiliary_loss_clip": 0.01607809, + "auxiliary_loss_mlp": 0.01508088, + "balance_loss_clip": 1.23979712, + "balance_loss_mlp": 1.13062358, + "epoch": 0.03739666315947693, + "flos": 22348594752480.0, + "grad_norm": 2.541409756616161, + "language_loss": 0.87010407, + "learning_rate": 3.999426334228518e-06, + "loss": 0.901263, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.8006045818328857 + }, + { + "auxiliary_loss_clip": 0.01612491, + "auxiliary_loss_mlp": 0.01492428, + "balance_loss_clip": 1.24407911, + "balance_loss_mlp": 1.1111486, + "epoch": 0.0374567864121449, + "flos": 20451636813600.0, + "grad_norm": 3.1834195937179164, + "language_loss": 0.90142524, + "learning_rate": 3.999416968866581e-06, + "loss": 0.93247437, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.751049280166626 + }, + { + "auxiliary_loss_clip": 0.01612065, + "auxiliary_loss_mlp": 0.01510839, + "balance_loss_clip": 1.24505949, + "balance_loss_mlp": 1.13470972, + "epoch": 0.037516909664812866, + "flos": 19210046109120.0, + "grad_norm": 1.9979078240914767, + "language_loss": 0.84217501, + "learning_rate": 3.999407527686551e-06, + "loss": 0.87340403, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 5.9195849895477295 + }, + { + "auxiliary_loss_clip": 0.01598808, + "auxiliary_loss_mlp": 0.014828, + "balance_loss_clip": 1.22972214, + "balance_loss_mlp": 1.09827805, + "epoch": 0.03757703291748084, + "flos": 35008041062880.0, + "grad_norm": 2.6318800762781454, + "language_loss": 0.66782117, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69863725, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 4.385825872421265 + }, + { + "auxiliary_loss_clip": 0.01604095, + "auxiliary_loss_mlp": 0.01485278, + "balance_loss_clip": 1.2363621, + "balance_loss_mlp": 1.11162853, + "epoch": 0.0376371561701488, + "flos": 25486271048160.0, + "grad_norm": 11.559183135689468, + "language_loss": 0.7769081, + "learning_rate": 3.999388417873652e-06, + "loss": 0.80780184, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.7707366943359375 + }, + { + "auxiliary_loss_clip": 0.01606613, + "auxiliary_loss_mlp": 0.01485245, + "balance_loss_clip": 1.2388165, + "balance_loss_mlp": 1.10301208, + "epoch": 0.037697279422816775, + "flos": 18187530642720.0, + "grad_norm": 2.0711570852940993, + "language_loss": 0.81246579, + "learning_rate": 3.999378749241506e-06, + "loss": 0.84338439, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 2.93574595451355 + }, + { + "auxiliary_loss_clip": 0.01616377, + "auxiliary_loss_mlp": 0.01509541, + "balance_loss_clip": 1.24866557, + "balance_loss_mlp": 1.13455641, + "epoch": 0.03775740267548475, + "flos": 24646571134560.0, + "grad_norm": 1.698266414284484, + "language_loss": 0.88903022, + "learning_rate": 3.999369004792719e-06, + "loss": 0.92028946, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 2.845170497894287 + }, + { + "auxiliary_loss_clip": 0.01606104, + "auxiliary_loss_mlp": 0.01486323, + "balance_loss_clip": 1.23989999, + "balance_loss_mlp": 1.10771441, + "epoch": 0.03781752592815271, + "flos": 21290350595040.0, + "grad_norm": 2.679902449104642, + "language_loss": 0.80012172, + "learning_rate": 3.999359184527658e-06, + "loss": 0.83104599, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 0.1273813247680664 + }, + { + "auxiliary_loss_clip": 0.01613519, + "auxiliary_loss_mlp": 0.01496351, + "balance_loss_clip": 1.24578929, + "balance_loss_mlp": 1.12346458, + "epoch": 0.037877649180820684, + "flos": 22091742702720.0, + "grad_norm": 1.9443373772693937, + "language_loss": 0.766653, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79775167, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 2.7352211475372314 + }, + { + "auxiliary_loss_clip": 0.01611272, + "auxiliary_loss_mlp": 0.01509707, + "balance_loss_clip": 1.24374318, + "balance_loss_mlp": 1.13643861, + "epoch": 0.03793777243348865, + "flos": 14503076527680.0, + "grad_norm": 3.691021463030873, + "language_loss": 0.91876328, + "learning_rate": 3.99933931655021e-06, + "loss": 0.94997311, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 2.7617692947387695 + }, + { + "auxiliary_loss_clip": 0.01614117, + "auxiliary_loss_mlp": 0.01507692, + "balance_loss_clip": 1.24522161, + "balance_loss_mlp": 1.12565005, + "epoch": 0.03799789568615662, + "flos": 21910595988960.0, + "grad_norm": 1.7459103250211903, + "language_loss": 0.92283547, + "learning_rate": 3.999329268838575e-06, + "loss": 0.95405352, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 2.7500088214874268 + }, + { + "auxiliary_loss_clip": 0.01603704, + "auxiliary_loss_mlp": 0.01493932, + "balance_loss_clip": 1.23439574, + "balance_loss_mlp": 1.11570513, + "epoch": 0.03805801893882459, + "flos": 24829348759200.0, + "grad_norm": 3.360531479311041, + "language_loss": 0.83415747, + "learning_rate": 3.999319145312175e-06, + "loss": 0.86513382, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.8336241245269775 + }, + { + "auxiliary_loss_clip": 0.01607287, + "auxiliary_loss_mlp": 0.01493602, + "balance_loss_clip": 1.23828936, + "balance_loss_mlp": 1.11804461, + "epoch": 0.03811814219149256, + "flos": 30485631729600.0, + "grad_norm": 1.6619492715412088, + "language_loss": 0.69781184, + "learning_rate": 3.999308945971392e-06, + "loss": 0.7288208, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.887653112411499 + }, + { + "auxiliary_loss_clip": 0.01817432, + "auxiliary_loss_mlp": 0.01412201, + "balance_loss_clip": 1.45410335, + "balance_loss_mlp": 1.06277466, + "epoch": 0.03817826544416053, + "flos": 66998639190240.0, + "grad_norm": 0.891081179544049, + "language_loss": 0.61618078, + "learning_rate": 3.999298670816614e-06, + "loss": 0.64847708, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 3.3389840126037598 + }, + { + "auxiliary_loss_clip": 0.01605075, + "auxiliary_loss_mlp": 0.01491796, + "balance_loss_clip": 1.2369988, + "balance_loss_mlp": 1.12081659, + "epoch": 0.038238388696828496, + "flos": 20487441360960.0, + "grad_norm": 4.5125940094357055, + "language_loss": 0.83672035, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86768901, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.775519609451294 + }, + { + "auxiliary_loss_clip": 0.01608592, + "auxiliary_loss_mlp": 0.01482546, + "balance_loss_clip": 1.24031091, + "balance_loss_mlp": 1.11156702, + "epoch": 0.03829851194949647, + "flos": 17967734769600.0, + "grad_norm": 2.8043054929706983, + "language_loss": 0.79679739, + "learning_rate": 3.999277893066632e-06, + "loss": 0.82770872, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.7113680839538574 + }, + { + "auxiliary_loss_clip": 0.01602805, + "auxiliary_loss_mlp": 0.01496642, + "balance_loss_clip": 1.23390269, + "balance_loss_mlp": 1.13481784, + "epoch": 0.03835863520216444, + "flos": 22458777150240.0, + "grad_norm": 2.6953357097063044, + "language_loss": 0.84182942, + "learning_rate": 3.999267390472215e-06, + "loss": 0.87282395, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.7406582832336426 + }, + { + "auxiliary_loss_clip": 0.01605306, + "auxiliary_loss_mlp": 0.01493775, + "balance_loss_clip": 1.23549974, + "balance_loss_mlp": 1.1189816, + "epoch": 0.038418758454832405, + "flos": 22166651547360.0, + "grad_norm": 2.574087805433906, + "language_loss": 0.70462394, + "learning_rate": 3.999256812065381e-06, + "loss": 0.73561478, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.8148891925811768 + }, + { + "auxiliary_loss_clip": 0.01607956, + "auxiliary_loss_mlp": 0.01487703, + "balance_loss_clip": 1.24012399, + "balance_loss_mlp": 1.12263656, + "epoch": 0.03847888170750038, + "flos": 22749651123840.0, + "grad_norm": 2.6758120371604392, + "language_loss": 0.85363841, + "learning_rate": 3.999246157846526e-06, + "loss": 0.88459504, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.8551368713378906 + }, + { + "auxiliary_loss_clip": 0.01607753, + "auxiliary_loss_mlp": 0.01512812, + "balance_loss_clip": 1.23919272, + "balance_loss_mlp": 1.15308571, + "epoch": 0.03853900496016834, + "flos": 22713429366720.0, + "grad_norm": 2.4534788909991243, + "language_loss": 0.82167995, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.8528856, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.8770225048065186 + }, + { + "auxiliary_loss_clip": 0.01777102, + "auxiliary_loss_mlp": 0.01428085, + "balance_loss_clip": 1.41159749, + "balance_loss_mlp": 1.0969696, + "epoch": 0.038599128212836314, + "flos": 70406138966400.0, + "grad_norm": 0.9092615802429368, + "language_loss": 0.65388739, + "learning_rate": 3.999224621974381e-06, + "loss": 0.68593931, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.3111531734466553 + }, + { + "auxiliary_loss_clip": 0.01598142, + "auxiliary_loss_mlp": 0.01487305, + "balance_loss_clip": 1.22954714, + "balance_loss_mlp": 1.12109375, + "epoch": 0.03865925146550429, + "flos": 23297415075360.0, + "grad_norm": 2.4127977865554553, + "language_loss": 0.79650217, + "learning_rate": 3.999213740321906e-06, + "loss": 0.82735664, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.7977418899536133 + }, + { + "auxiliary_loss_clip": 0.0161322, + "auxiliary_loss_mlp": 0.01500556, + "balance_loss_clip": 1.2430886, + "balance_loss_mlp": 1.12194765, + "epoch": 0.03871937471817225, + "flos": 21432393011520.0, + "grad_norm": 2.1322733930146334, + "language_loss": 0.83047754, + "learning_rate": 3.999202782859046e-06, + "loss": 0.8616153, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.740513563156128 + }, + { + "auxiliary_loss_clip": 0.01600632, + "auxiliary_loss_mlp": 0.01488986, + "balance_loss_clip": 1.23004127, + "balance_loss_mlp": 1.1218214, + "epoch": 0.038779497970840224, + "flos": 34280192386080.0, + "grad_norm": 14.443881389345197, + "language_loss": 0.82216954, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.85306579, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 2.798919439315796 + }, + { + "auxiliary_loss_clip": 0.01605944, + "auxiliary_loss_mlp": 0.01494632, + "balance_loss_clip": 1.23597181, + "balance_loss_mlp": 1.10992026, + "epoch": 0.03883962122350819, + "flos": 22750447615200.0, + "grad_norm": 2.371701215873247, + "language_loss": 0.81627011, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84727585, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 2.7662453651428223 + }, + { + "auxiliary_loss_clip": 0.01613659, + "auxiliary_loss_mlp": 0.01489576, + "balance_loss_clip": 1.24505734, + "balance_loss_mlp": 1.11001396, + "epoch": 0.03889974447617616, + "flos": 21948069375360.0, + "grad_norm": 2.761398883882831, + "language_loss": 0.82165468, + "learning_rate": 3.999169455612323e-06, + "loss": 0.85268706, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.7985754013061523 + }, + { + "auxiliary_loss_clip": 0.01599747, + "auxiliary_loss_mlp": 0.01482996, + "balance_loss_clip": 1.23015904, + "balance_loss_mlp": 1.09904635, + "epoch": 0.03895986772884413, + "flos": 31507578273600.0, + "grad_norm": 2.5852702761132984, + "language_loss": 0.84504569, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87587309, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.897336959838867 + }, + { + "auxiliary_loss_clip": 0.01604141, + "auxiliary_loss_mlp": 0.01487435, + "balance_loss_clip": 1.23440158, + "balance_loss_mlp": 1.10024285, + "epoch": 0.0390199909815121, + "flos": 19903038442560.0, + "grad_norm": 2.7448437102389884, + "language_loss": 0.84491432, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.87583005, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.7217519283294678 + }, + { + "auxiliary_loss_clip": 0.01604753, + "auxiliary_loss_mlp": 0.01488133, + "balance_loss_clip": 1.23543048, + "balance_loss_mlp": 1.09941542, + "epoch": 0.03908011423418007, + "flos": 21614412072960.0, + "grad_norm": 1.8343637504496368, + "language_loss": 0.79841191, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82934076, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.775531530380249 + }, + { + "auxiliary_loss_clip": 0.01612266, + "auxiliary_loss_mlp": 0.01489535, + "balance_loss_clip": 1.24237847, + "balance_loss_mlp": 1.10787416, + "epoch": 0.039140237486848035, + "flos": 18663420002400.0, + "grad_norm": 2.1870486205460686, + "language_loss": 0.78619325, + "learning_rate": 3.9991239579635e-06, + "loss": 0.81721127, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 2.775134801864624 + }, + { + "auxiliary_loss_clip": 0.01607112, + "auxiliary_loss_mlp": 0.01478584, + "balance_loss_clip": 1.23862612, + "balance_loss_mlp": 1.10016561, + "epoch": 0.03920036073951601, + "flos": 18663116577120.0, + "grad_norm": 3.6244518024686125, + "language_loss": 0.87744653, + "learning_rate": 3.999112394032757e-06, + "loss": 0.9083035, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 2.795557737350464 + }, + { + "auxiliary_loss_clip": 0.01613158, + "auxiliary_loss_mlp": 0.014776, + "balance_loss_clip": 1.24336231, + "balance_loss_mlp": 1.09822845, + "epoch": 0.03926048399218398, + "flos": 31356964092960.0, + "grad_norm": 4.187290631411017, + "language_loss": 0.79460371, + "learning_rate": 3.999100754295471e-06, + "loss": 0.82551122, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.8339638710021973 + }, + { + "auxiliary_loss_clip": 0.01605194, + "auxiliary_loss_mlp": 0.01485964, + "balance_loss_clip": 1.23413038, + "balance_loss_mlp": 1.104303, + "epoch": 0.039320607244851945, + "flos": 29605917242880.0, + "grad_norm": 2.7928692484600117, + "language_loss": 0.86023968, + "learning_rate": 3.999089038752085e-06, + "loss": 0.89115131, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 2.787827730178833 + }, + { + "auxiliary_loss_clip": 0.01803975, + "auxiliary_loss_mlp": 0.01425438, + "balance_loss_clip": 1.44170558, + "balance_loss_mlp": 1.06380463, + "epoch": 0.03938073049751992, + "flos": 66541562197920.0, + "grad_norm": 0.740460585239136, + "language_loss": 0.49872398, + "learning_rate": 3.999077247403041e-06, + "loss": 0.53101814, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.424732208251953 + }, + { + "auxiliary_loss_clip": 0.01609281, + "auxiliary_loss_mlp": 0.0150639, + "balance_loss_clip": 1.24079895, + "balance_loss_mlp": 1.12644625, + "epoch": 0.03944085375018788, + "flos": 23370086158560.0, + "grad_norm": 2.4694452093719597, + "language_loss": 0.81235647, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.84351313, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 2.743978500366211 + }, + { + "auxiliary_loss_clip": 0.01603361, + "auxiliary_loss_mlp": 0.01492393, + "balance_loss_clip": 1.23522902, + "balance_loss_mlp": 1.12255788, + "epoch": 0.039500977002855854, + "flos": 18550089567360.0, + "grad_norm": 3.204708063930149, + "language_loss": 0.76316512, + "learning_rate": 3.999053437289776e-06, + "loss": 0.79412264, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 2.734302520751953 + }, + { + "auxiliary_loss_clip": 0.01606107, + "auxiliary_loss_mlp": 0.0149164, + "balance_loss_clip": 1.23707724, + "balance_loss_mlp": 1.12733614, + "epoch": 0.039561100255523826, + "flos": 25340549600160.0, + "grad_norm": 3.8903089053012185, + "language_loss": 0.81605303, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84703046, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 2.7749621868133545 + }, + { + "auxiliary_loss_clip": 0.01609643, + "auxiliary_loss_mlp": 0.01486432, + "balance_loss_clip": 1.24073696, + "balance_loss_mlp": 1.11449897, + "epoch": 0.03962122350819179, + "flos": 18221666351040.0, + "grad_norm": 2.7187359231768435, + "language_loss": 0.91171771, + "learning_rate": 3.999029323959287e-06, + "loss": 0.94267845, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 2.77104115486145 + }, + { + "auxiliary_loss_clip": 0.01602124, + "auxiliary_loss_mlp": 0.01497227, + "balance_loss_clip": 1.23097205, + "balance_loss_mlp": 1.12682021, + "epoch": 0.03968134676085976, + "flos": 20524345824960.0, + "grad_norm": 3.3081303905868973, + "language_loss": 0.79542917, + "learning_rate": 3.999017153588724e-06, + "loss": 0.82642269, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.7595980167388916 + }, + { + "auxiliary_loss_clip": 0.01606318, + "auxiliary_loss_mlp": 0.01489039, + "balance_loss_clip": 1.23693895, + "balance_loss_mlp": 1.11557996, + "epoch": 0.03974147001352773, + "flos": 22424982795360.0, + "grad_norm": 1.7772534995340983, + "language_loss": 0.81687689, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84783047, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 2.839015245437622 + }, + { + "auxiliary_loss_clip": 0.01792547, + "auxiliary_loss_mlp": 0.01461143, + "balance_loss_clip": 1.43097055, + "balance_loss_mlp": 1.14147186, + "epoch": 0.0398015932661957, + "flos": 71135808194880.0, + "grad_norm": 0.9675452808174713, + "language_loss": 0.69331229, + "learning_rate": 3.998992585439272e-06, + "loss": 0.72584915, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 6.619063854217529 + }, + { + "auxiliary_loss_clip": 0.01605386, + "auxiliary_loss_mlp": 0.01483442, + "balance_loss_clip": 1.23622966, + "balance_loss_mlp": 1.10826612, + "epoch": 0.03986171651886367, + "flos": 16802873461440.0, + "grad_norm": 2.5233082834130296, + "language_loss": 0.83183777, + "learning_rate": 3.998980187661314e-06, + "loss": 0.86272603, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 4.440661668777466 + }, + { + "auxiliary_loss_clip": 0.01593145, + "auxiliary_loss_mlp": 0.0149779, + "balance_loss_clip": 1.22260416, + "balance_loss_mlp": 1.12280536, + "epoch": 0.03992183977153164, + "flos": 24537374868960.0, + "grad_norm": 6.031112493924439, + "language_loss": 0.877967, + "learning_rate": 3.998967714081826e-06, + "loss": 0.9088763, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.8828299045562744 + }, + { + "auxiliary_loss_clip": 0.01607941, + "auxiliary_loss_mlp": 0.01519779, + "balance_loss_clip": 1.23849154, + "balance_loss_mlp": 1.15261436, + "epoch": 0.03998196302419961, + "flos": 15597694154880.0, + "grad_norm": 2.5266163418369336, + "language_loss": 0.84759527, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87887245, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 2.8410983085632324 + }, + { + "auxiliary_loss_clip": 0.01616555, + "auxiliary_loss_mlp": 0.01506829, + "balance_loss_clip": 1.24777281, + "balance_loss_mlp": 1.12860203, + "epoch": 0.04004208627686758, + "flos": 25307627592960.0, + "grad_norm": 2.561071300774721, + "language_loss": 0.81761456, + "learning_rate": 3.998942539520158e-06, + "loss": 0.84884834, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 2.8946778774261475 + }, + { + "auxiliary_loss_clip": 0.01606883, + "auxiliary_loss_mlp": 0.01497288, + "balance_loss_clip": 1.23592854, + "balance_loss_mlp": 1.12573624, + "epoch": 0.04010220952953555, + "flos": 23478030794880.0, + "grad_norm": 2.950968921719695, + "language_loss": 0.87086749, + "learning_rate": 3.998929838538932e-06, + "loss": 0.90190923, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.872547149658203 + }, + { + "auxiliary_loss_clip": 0.01607906, + "auxiliary_loss_mlp": 0.0152008, + "balance_loss_clip": 1.23730421, + "balance_loss_mlp": 1.14032722, + "epoch": 0.04016233278220352, + "flos": 18618588552960.0, + "grad_norm": 2.5753355312453925, + "language_loss": 0.8071419, + "learning_rate": 3.998917061758087e-06, + "loss": 0.8384217, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 2.7376015186309814 + }, + { + "auxiliary_loss_clip": 0.01754759, + "auxiliary_loss_mlp": 0.01409958, + "balance_loss_clip": 1.39335203, + "balance_loss_mlp": 1.06053162, + "epoch": 0.040222456034871484, + "flos": 70913015997120.0, + "grad_norm": 0.8042007429002167, + "language_loss": 0.60026842, + "learning_rate": 3.998904209178107e-06, + "loss": 0.63191557, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 3.424560070037842 + }, + { + "auxiliary_loss_clip": 0.01600035, + "auxiliary_loss_mlp": 0.01486095, + "balance_loss_clip": 1.23014307, + "balance_loss_mlp": 1.11301732, + "epoch": 0.040282579287539456, + "flos": 23766742863360.0, + "grad_norm": 1.97881140667893, + "language_loss": 0.86114395, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.89200521, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 2.8496997356414795 + }, + { + "auxiliary_loss_clip": 0.01608478, + "auxiliary_loss_mlp": 0.01475261, + "balance_loss_clip": 1.23756623, + "balance_loss_mlp": 1.09875035, + "epoch": 0.04034270254020743, + "flos": 18480490665120.0, + "grad_norm": 3.3237908022766947, + "language_loss": 0.75077093, + "learning_rate": 3.998878276622692e-06, + "loss": 0.78160834, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.759674549102783 + }, + { + "auxiliary_loss_clip": 0.01608754, + "auxiliary_loss_mlp": 0.01485127, + "balance_loss_clip": 1.23791242, + "balance_loss_mlp": 1.10270357, + "epoch": 0.040402825792875394, + "flos": 17203778120160.0, + "grad_norm": 1.9601322435348203, + "language_loss": 0.92445856, + "learning_rate": 3.998865196648242e-06, + "loss": 0.95539737, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 2.705240249633789 + }, + { + "auxiliary_loss_clip": 0.0160665, + "auxiliary_loss_mlp": 0.01479131, + "balance_loss_clip": 1.23657465, + "balance_loss_mlp": 1.10052228, + "epoch": 0.040462949045543366, + "flos": 19174203633600.0, + "grad_norm": 2.0890874487112927, + "language_loss": 0.90116191, + "learning_rate": 3.998852040876622e-06, + "loss": 0.93201971, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 2.770909309387207 + }, + { + "auxiliary_loss_clip": 0.01597623, + "auxiliary_loss_mlp": 0.01483498, + "balance_loss_clip": 1.2266829, + "balance_loss_mlp": 1.10717833, + "epoch": 0.04052307229821133, + "flos": 24021850217760.0, + "grad_norm": 3.0091981422660776, + "language_loss": 0.75317419, + "learning_rate": 3.998838809308334e-06, + "loss": 0.78398544, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 2.7590174674987793 + }, + { + "auxiliary_loss_clip": 0.01604789, + "auxiliary_loss_mlp": 0.01478754, + "balance_loss_clip": 1.23366904, + "balance_loss_mlp": 1.10491419, + "epoch": 0.0405831955508793, + "flos": 16438531913280.0, + "grad_norm": 2.957573882704172, + "language_loss": 0.78461319, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.81544864, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.7270474433898926 + }, + { + "auxiliary_loss_clip": 0.01602573, + "auxiliary_loss_mlp": 0.01472657, + "balance_loss_clip": 1.23155165, + "balance_loss_mlp": 1.09309506, + "epoch": 0.040643318803547275, + "flos": 24282229586400.0, + "grad_norm": 1.8074864061920781, + "language_loss": 0.7719627, + "learning_rate": 3.998812118783757e-06, + "loss": 0.80271506, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 2.785032033920288 + }, + { + "auxiliary_loss_clip": 0.01601367, + "auxiliary_loss_mlp": 0.01482798, + "balance_loss_clip": 1.23065984, + "balance_loss_mlp": 1.1085763, + "epoch": 0.04070344205621524, + "flos": 17713992828960.0, + "grad_norm": 2.839157845457768, + "language_loss": 0.85442698, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.88526869, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.6856210231781006 + }, + { + "auxiliary_loss_clip": 0.01603333, + "auxiliary_loss_mlp": 0.01481955, + "balance_loss_clip": 1.23525393, + "balance_loss_mlp": 1.10792351, + "epoch": 0.04076356530888321, + "flos": 26180097801120.0, + "grad_norm": 2.5477696153360117, + "language_loss": 0.76584363, + "learning_rate": 3.998785125078559e-06, + "loss": 0.79669654, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.806833267211914 + }, + { + "auxiliary_loss_clip": 0.0159236, + "auxiliary_loss_mlp": 0.01496484, + "balance_loss_clip": 1.22087216, + "balance_loss_mlp": 1.13065481, + "epoch": 0.04082368856155118, + "flos": 35776435307040.0, + "grad_norm": 3.137874074596287, + "language_loss": 0.82511741, + "learning_rate": 3.998771514534505e-06, + "loss": 0.85600585, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.8581974506378174 + }, + { + "auxiliary_loss_clip": 0.01606478, + "auxiliary_loss_mlp": 0.01507238, + "balance_loss_clip": 1.23644149, + "balance_loss_mlp": 1.14159894, + "epoch": 0.04088381181421915, + "flos": 28149119972640.0, + "grad_norm": 2.119964698059674, + "language_loss": 0.76563746, + "learning_rate": 3.998757828196835e-06, + "loss": 0.79677463, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.9000139236450195 + }, + { + "auxiliary_loss_clip": 0.01591372, + "auxiliary_loss_mlp": 0.01498351, + "balance_loss_clip": 1.21937561, + "balance_loss_mlp": 1.13099551, + "epoch": 0.04094393506688712, + "flos": 27600028535520.0, + "grad_norm": 3.1705976176485255, + "language_loss": 0.83011782, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.86101508, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.8015434741973877 + }, + { + "auxiliary_loss_clip": 0.01593122, + "auxiliary_loss_mlp": 0.01489443, + "balance_loss_clip": 1.22296906, + "balance_loss_mlp": 1.11770105, + "epoch": 0.04100405831955509, + "flos": 23114258169120.0, + "grad_norm": 2.509997247721602, + "language_loss": 0.71353412, + "learning_rate": 3.998730228142726e-06, + "loss": 0.74435973, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.79766583442688 + }, + { + "auxiliary_loss_clip": 0.01602209, + "auxiliary_loss_mlp": 0.01483555, + "balance_loss_clip": 1.23175955, + "balance_loss_mlp": 1.10704446, + "epoch": 0.04106418157222306, + "flos": 20158790575680.0, + "grad_norm": 2.5562617904037723, + "language_loss": 0.726946, + "learning_rate": 3.998716314427333e-06, + "loss": 0.75780362, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 2.8018555641174316 + }, + { + "auxiliary_loss_clip": 0.01600572, + "auxiliary_loss_mlp": 0.01488117, + "balance_loss_clip": 1.23073411, + "balance_loss_mlp": 1.11217833, + "epoch": 0.041124304824891024, + "flos": 17422625789280.0, + "grad_norm": 3.100146561389531, + "language_loss": 0.81918395, + "learning_rate": 3.998702324920417e-06, + "loss": 0.85007083, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.804476261138916 + }, + { + "auxiliary_loss_clip": 0.01594772, + "auxiliary_loss_mlp": 0.01497659, + "balance_loss_clip": 1.22329831, + "balance_loss_mlp": 1.11637974, + "epoch": 0.041184428077558996, + "flos": 25782796317600.0, + "grad_norm": 4.1778119537013, + "language_loss": 0.90656966, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.93749392, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.870447874069214 + }, + { + "auxiliary_loss_clip": 0.01596922, + "auxiliary_loss_mlp": 0.01475736, + "balance_loss_clip": 1.2266562, + "balance_loss_mlp": 1.10437512, + "epoch": 0.04124455133022697, + "flos": 22966905810240.0, + "grad_norm": 3.7612473534691713, + "language_loss": 0.88416922, + "learning_rate": 3.998674118534141e-06, + "loss": 0.91489583, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 2.787524700164795 + }, + { + "auxiliary_loss_clip": 0.0159434, + "auxiliary_loss_mlp": 0.01477197, + "balance_loss_clip": 1.22369289, + "balance_loss_mlp": 1.10106778, + "epoch": 0.04130467458289493, + "flos": 21291260870880.0, + "grad_norm": 1.9779300772106982, + "language_loss": 0.71829319, + "learning_rate": 3.998659901655851e-06, + "loss": 0.74900854, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 2.8056998252868652 + }, + { + "auxiliary_loss_clip": 0.01600765, + "auxiliary_loss_mlp": 0.01490073, + "balance_loss_clip": 1.23109996, + "balance_loss_mlp": 1.12596023, + "epoch": 0.041364797835562905, + "flos": 19976278448160.0, + "grad_norm": 3.3677040519607933, + "language_loss": 0.86427259, + "learning_rate": 3.998645608988177e-06, + "loss": 0.89518106, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.781090021133423 + }, + { + "auxiliary_loss_clip": 0.01606187, + "auxiliary_loss_mlp": 0.01486889, + "balance_loss_clip": 1.23698461, + "balance_loss_mlp": 1.12048686, + "epoch": 0.04142492108823087, + "flos": 21908282371200.0, + "grad_norm": 1.9986356383873276, + "language_loss": 0.83171743, + "learning_rate": 3.998631240531661e-06, + "loss": 0.86264819, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.759500026702881 + }, + { + "auxiliary_loss_clip": 0.01605691, + "auxiliary_loss_mlp": 0.01494365, + "balance_loss_clip": 1.23600769, + "balance_loss_mlp": 1.12510276, + "epoch": 0.04148504434089884, + "flos": 27642280870080.0, + "grad_norm": 3.1448269054445657, + "language_loss": 0.68308115, + "learning_rate": 3.998616796286848e-06, + "loss": 0.71408176, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 2.7530407905578613 + }, + { + "auxiliary_loss_clip": 0.0159886, + "auxiliary_loss_mlp": 0.01482693, + "balance_loss_clip": 1.2271564, + "balance_loss_mlp": 1.11381149, + "epoch": 0.041545167593566815, + "flos": 20520287511840.0, + "grad_norm": 1.7229099403481885, + "language_loss": 0.75247079, + "learning_rate": 3.998602276254286e-06, + "loss": 0.78328633, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.7748217582702637 + }, + { + "auxiliary_loss_clip": 0.01598905, + "auxiliary_loss_mlp": 0.0146503, + "balance_loss_clip": 1.22796416, + "balance_loss_mlp": 1.09805644, + "epoch": 0.04160529084623478, + "flos": 11870191213920.0, + "grad_norm": 2.022197853994676, + "language_loss": 0.84277117, + "learning_rate": 3.998587680434526e-06, + "loss": 0.87341052, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.770233631134033 + }, + { + "auxiliary_loss_clip": 0.01592856, + "auxiliary_loss_mlp": 0.01497967, + "balance_loss_clip": 1.22175264, + "balance_loss_mlp": 1.13614249, + "epoch": 0.04166541409890275, + "flos": 14829982617600.0, + "grad_norm": 2.7788628061650598, + "language_loss": 0.88871324, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91962141, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.8437001705169678 + }, + { + "auxiliary_loss_clip": 0.01596034, + "auxiliary_loss_mlp": 0.01478736, + "balance_loss_clip": 1.22463131, + "balance_loss_mlp": 1.09898269, + "epoch": 0.04172553735157072, + "flos": 25810294597920.0, + "grad_norm": 2.7541711984485184, + "language_loss": 0.82111371, + "learning_rate": 3.998558261435626e-06, + "loss": 0.85186136, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.8551061153411865 + }, + { + "auxiliary_loss_clip": 0.01595351, + "auxiliary_loss_mlp": 0.01488931, + "balance_loss_clip": 1.22277212, + "balance_loss_mlp": 1.11509097, + "epoch": 0.04178566060423869, + "flos": 24282115801920.0, + "grad_norm": 3.146354504401753, + "language_loss": 0.83553201, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.86637485, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.8221702575683594 + }, + { + "auxiliary_loss_clip": 0.01594107, + "auxiliary_loss_mlp": 0.01501942, + "balance_loss_clip": 1.2229867, + "balance_loss_mlp": 1.12772059, + "epoch": 0.04184578385690666, + "flos": 18223904112480.0, + "grad_norm": 5.988305085935899, + "language_loss": 0.84753388, + "learning_rate": 3.99852853929461e-06, + "loss": 0.87849438, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 2.779721736907959 + }, + { + "auxiliary_loss_clip": 0.0159796, + "auxiliary_loss_mlp": 0.01493039, + "balance_loss_clip": 1.22634101, + "balance_loss_mlp": 1.11900771, + "epoch": 0.041905907109574626, + "flos": 22778135536320.0, + "grad_norm": 5.262559617298667, + "language_loss": 0.92860287, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95951283, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 2.7621588706970215 + }, + { + "auxiliary_loss_clip": 0.01601041, + "auxiliary_loss_mlp": 0.01486869, + "balance_loss_clip": 1.22960234, + "balance_loss_mlp": 1.11627078, + "epoch": 0.0419660303622426, + "flos": 20159018144640.0, + "grad_norm": 2.2125115482434294, + "language_loss": 0.83721548, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86809462, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 2.7284653186798096 + }, + { + "auxiliary_loss_clip": 0.01589614, + "auxiliary_loss_mlp": 0.01481247, + "balance_loss_clip": 1.21825647, + "balance_loss_mlp": 1.09882319, + "epoch": 0.042026153614910564, + "flos": 23078643262560.0, + "grad_norm": 2.283695197667616, + "language_loss": 0.91228271, + "learning_rate": 3.998483387701495e-06, + "loss": 0.94299126, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 2.8012545108795166 + }, + { + "auxiliary_loss_clip": 0.01692721, + "auxiliary_loss_mlp": 0.01427826, + "balance_loss_clip": 1.32515383, + "balance_loss_mlp": 1.10205078, + "epoch": 0.042086276867578536, + "flos": 64502296345440.0, + "grad_norm": 0.9192578352694548, + "language_loss": 0.67837119, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70957667, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 6.351967096328735 + }, + { + "auxiliary_loss_clip": 0.01594611, + "auxiliary_loss_mlp": 0.01488937, + "balance_loss_clip": 1.22200704, + "balance_loss_mlp": 1.11147273, + "epoch": 0.04214640012024651, + "flos": 15488839242720.0, + "grad_norm": 2.671256294864947, + "language_loss": 0.88794541, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91878086, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 4.246098518371582 + }, + { + "auxiliary_loss_clip": 0.01596947, + "auxiliary_loss_mlp": 0.01480825, + "balance_loss_clip": 1.22405291, + "balance_loss_mlp": 1.11156178, + "epoch": 0.04220652337291447, + "flos": 23879238878880.0, + "grad_norm": 1.7568666067236753, + "language_loss": 0.67251247, + "learning_rate": 3.998437554064184e-06, + "loss": 0.70329022, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 2.7835123538970947 + }, + { + "auxiliary_loss_clip": 0.01684293, + "auxiliary_loss_mlp": 0.01413139, + "balance_loss_clip": 1.31524229, + "balance_loss_mlp": 1.07820892, + "epoch": 0.042266646625582445, + "flos": 63802097661600.0, + "grad_norm": 0.8473525945660978, + "language_loss": 0.60753208, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.63850641, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.3474416732788086 + }, + { + "auxiliary_loss_clip": 0.01677954, + "auxiliary_loss_mlp": 0.01430679, + "balance_loss_clip": 1.30836618, + "balance_loss_mlp": 1.1033783, + "epoch": 0.04232676987825041, + "flos": 50025579888960.0, + "grad_norm": 1.02886158329967, + "language_loss": 0.57620215, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.60728848, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 3.137460708618164 + }, + { + "auxiliary_loss_clip": 0.01603879, + "auxiliary_loss_mlp": 0.01494559, + "balance_loss_clip": 1.22933233, + "balance_loss_mlp": 1.11556888, + "epoch": 0.04238689313091838, + "flos": 21618242817120.0, + "grad_norm": 3.5494106283699467, + "language_loss": 0.87496781, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90595222, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 2.808864116668701 + }, + { + "auxiliary_loss_clip": 0.01587676, + "auxiliary_loss_mlp": 0.01496361, + "balance_loss_clip": 1.21454954, + "balance_loss_mlp": 1.12061357, + "epoch": 0.042447016383586354, + "flos": 19137109528800.0, + "grad_norm": 2.643657545774481, + "language_loss": 0.71194494, + "learning_rate": 3.998375381617201e-06, + "loss": 0.74278533, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 2.775763750076294 + }, + { + "auxiliary_loss_clip": 0.01593975, + "auxiliary_loss_mlp": 0.0147778, + "balance_loss_clip": 1.21917033, + "balance_loss_mlp": 1.09726429, + "epoch": 0.04250713963625432, + "flos": 24428330316000.0, + "grad_norm": 2.7679258776351543, + "language_loss": 0.932244, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.96296155, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.7949912548065186 + }, + { + "auxiliary_loss_clip": 0.01585156, + "auxiliary_loss_mlp": 0.01480207, + "balance_loss_clip": 1.21195197, + "balance_loss_mlp": 1.09988225, + "epoch": 0.04256726288892229, + "flos": 30369722179680.0, + "grad_norm": 2.0986758068751294, + "language_loss": 0.81234789, + "learning_rate": 3.998343840719776e-06, + "loss": 0.84300154, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.8423423767089844 + }, + { + "auxiliary_loss_clip": 0.01592776, + "auxiliary_loss_mlp": 0.01484253, + "balance_loss_clip": 1.21834612, + "balance_loss_mlp": 1.10488105, + "epoch": 0.04262738614159026, + "flos": 16364533344480.0, + "grad_norm": 2.95439650920094, + "language_loss": 0.82603228, + "learning_rate": 3.998327956604666e-06, + "loss": 0.85680264, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.714772939682007 + }, + { + "auxiliary_loss_clip": 0.01602716, + "auxiliary_loss_mlp": 0.01511809, + "balance_loss_clip": 1.2286005, + "balance_loss_mlp": 1.12309122, + "epoch": 0.04268750939425823, + "flos": 20414466852480.0, + "grad_norm": 3.396824249154019, + "language_loss": 0.85784227, + "learning_rate": 3.99831199671276e-06, + "loss": 0.88898754, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.7901339530944824 + }, + { + "auxiliary_loss_clip": 0.01589368, + "auxiliary_loss_mlp": 0.01483896, + "balance_loss_clip": 1.21669316, + "balance_loss_mlp": 1.11139035, + "epoch": 0.0427476326469262, + "flos": 20305043017920.0, + "grad_norm": 5.495645761149696, + "language_loss": 0.84702909, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87776172, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.8049917221069336 + }, + { + "auxiliary_loss_clip": 0.01591564, + "auxiliary_loss_mlp": 0.01473155, + "balance_loss_clip": 1.2170099, + "balance_loss_mlp": 1.09378278, + "epoch": 0.042807755899594166, + "flos": 21652833663360.0, + "grad_norm": 2.3889448213835682, + "language_loss": 0.85420018, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.8848474, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 2.796342372894287 + }, + { + "auxiliary_loss_clip": 0.0159093, + "auxiliary_loss_mlp": 0.0147915, + "balance_loss_clip": 1.21652102, + "balance_loss_mlp": 1.09558201, + "epoch": 0.04286787915226214, + "flos": 21437209887840.0, + "grad_norm": 2.6130639966695832, + "language_loss": 0.90997517, + "learning_rate": 3.998263662382328e-06, + "loss": 0.94067597, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 2.7786450386047363 + }, + { + "auxiliary_loss_clip": 0.01658597, + "auxiliary_loss_mlp": 0.01431801, + "balance_loss_clip": 1.28810537, + "balance_loss_mlp": 1.11823273, + "epoch": 0.04292800240493011, + "flos": 66405436574400.0, + "grad_norm": 0.9197108960426409, + "language_loss": 0.63699526, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.66789925, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 3.423335075378418 + }, + { + "auxiliary_loss_clip": 0.01593492, + "auxiliary_loss_mlp": 0.01504002, + "balance_loss_clip": 1.21901965, + "balance_loss_mlp": 1.12558365, + "epoch": 0.042988125657598075, + "flos": 31652996296320.0, + "grad_norm": 2.744546820664693, + "language_loss": 0.75066769, + "learning_rate": 3.998231060622563e-06, + "loss": 0.78164262, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.8598709106445312 + }, + { + "auxiliary_loss_clip": 0.01598785, + "auxiliary_loss_mlp": 0.01477397, + "balance_loss_clip": 1.22521377, + "balance_loss_mlp": 1.10641789, + "epoch": 0.04304824891026605, + "flos": 33250887779040.0, + "grad_norm": 2.059301024471306, + "language_loss": 0.7274164, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75817817, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.8507139682769775 + }, + { + "auxiliary_loss_clip": 0.01657469, + "auxiliary_loss_mlp": 0.01419067, + "balance_loss_clip": 1.28810751, + "balance_loss_mlp": 1.08718872, + "epoch": 0.04310837216293401, + "flos": 64072224567360.0, + "grad_norm": 0.9289372980779688, + "language_loss": 0.65439975, + "learning_rate": 3.998198155770314e-06, + "loss": 0.68516511, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 3.287321090698242 + }, + { + "auxiliary_loss_clip": 0.01655355, + "auxiliary_loss_mlp": 0.01409096, + "balance_loss_clip": 1.28579593, + "balance_loss_mlp": 1.08026886, + "epoch": 0.043168495415601985, + "flos": 61349865995520.0, + "grad_norm": 1.0021738485810607, + "language_loss": 0.58765817, + "learning_rate": 3.998181589686065e-06, + "loss": 0.61830264, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 3.1091244220733643 + }, + { + "auxiliary_loss_clip": 0.01599424, + "auxiliary_loss_mlp": 0.01494705, + "balance_loss_clip": 1.22745192, + "balance_loss_mlp": 1.13650489, + "epoch": 0.04322861866826996, + "flos": 20706364886400.0, + "grad_norm": 2.4471727179937863, + "language_loss": 0.91569841, + "learning_rate": 3.99816494783057e-06, + "loss": 0.94663972, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.747295379638672 + }, + { + "auxiliary_loss_clip": 0.01584394, + "auxiliary_loss_mlp": 0.01492258, + "balance_loss_clip": 1.21159172, + "balance_loss_mlp": 1.12528384, + "epoch": 0.04328874192093792, + "flos": 30376018254240.0, + "grad_norm": 1.8020210983284894, + "language_loss": 0.66707134, + "learning_rate": 3.99814823020446e-06, + "loss": 0.69783795, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.8388423919677734 + }, + { + "auxiliary_loss_clip": 0.01596119, + "auxiliary_loss_mlp": 0.01481306, + "balance_loss_clip": 1.22394955, + "balance_loss_mlp": 1.11948204, + "epoch": 0.043348865173605894, + "flos": 21946969458720.0, + "grad_norm": 2.211703930272912, + "language_loss": 0.77861583, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80939007, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.7773261070251465 + }, + { + "auxiliary_loss_clip": 0.01599643, + "auxiliary_loss_mlp": 0.01488848, + "balance_loss_clip": 1.22650266, + "balance_loss_mlp": 1.12359107, + "epoch": 0.04340898842627386, + "flos": 15265819476000.0, + "grad_norm": 2.877311649036385, + "language_loss": 0.88215005, + "learning_rate": 3.998114567642933e-06, + "loss": 0.91303504, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.748526096343994 + }, + { + "auxiliary_loss_clip": 0.01591, + "auxiliary_loss_mlp": 0.01503842, + "balance_loss_clip": 1.21835661, + "balance_loss_mlp": 1.14011025, + "epoch": 0.04346911167894183, + "flos": 27967973258880.0, + "grad_norm": 2.3664577939682725, + "language_loss": 0.8464638, + "learning_rate": 3.998097622708792e-06, + "loss": 0.87741226, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 2.8445754051208496 + }, + { + "auxiliary_loss_clip": 0.01593616, + "auxiliary_loss_mlp": 0.01500511, + "balance_loss_clip": 1.22110367, + "balance_loss_mlp": 1.13830519, + "epoch": 0.0435292349316098, + "flos": 29244799588320.0, + "grad_norm": 1.8969738413476698, + "language_loss": 0.82931745, + "learning_rate": 3.99808060200659e-06, + "loss": 0.8602587, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.813234806060791 + }, + { + "auxiliary_loss_clip": 0.01605921, + "auxiliary_loss_mlp": 0.01493141, + "balance_loss_clip": 1.23172069, + "balance_loss_mlp": 1.13093507, + "epoch": 0.04358935818427777, + "flos": 20560453797600.0, + "grad_norm": 2.020741884649762, + "language_loss": 0.79904085, + "learning_rate": 3.998063505536971e-06, + "loss": 0.83003151, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 2.833235740661621 + }, + { + "auxiliary_loss_clip": 0.01593085, + "auxiliary_loss_mlp": 0.01501484, + "balance_loss_clip": 1.21964085, + "balance_loss_mlp": 1.13489199, + "epoch": 0.04364948143694574, + "flos": 14466665129760.0, + "grad_norm": 3.8073806282357356, + "language_loss": 0.87491548, + "learning_rate": 3.998046333300584e-06, + "loss": 0.90586114, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 2.703495979309082 + }, + { + "auxiliary_loss_clip": 0.0165919, + "auxiliary_loss_mlp": 0.01595154, + "balance_loss_clip": 1.29114223, + "balance_loss_mlp": 1.31744385, + "epoch": 0.043709604689613706, + "flos": 50072611536000.0, + "grad_norm": 1.080915747674032, + "language_loss": 0.5587393, + "learning_rate": 3.998029085298079e-06, + "loss": 0.59128273, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.4423599243164062 + }, + { + "auxiliary_loss_clip": 0.01598987, + "auxiliary_loss_mlp": 0.01476136, + "balance_loss_clip": 1.22508287, + "balance_loss_mlp": 1.10858965, + "epoch": 0.04376972794228168, + "flos": 13993354884960.0, + "grad_norm": 3.547095342724641, + "language_loss": 0.82453889, + "learning_rate": 3.998011761530112e-06, + "loss": 0.85529006, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.7787296772003174 + }, + { + "auxiliary_loss_clip": 0.01593469, + "auxiliary_loss_mlp": 0.014804, + "balance_loss_clip": 1.22104192, + "balance_loss_mlp": 1.10293543, + "epoch": 0.04382985119494965, + "flos": 22011637700160.0, + "grad_norm": 2.105318428561484, + "language_loss": 0.76920021, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79993886, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 2.7676234245300293 + }, + { + "auxiliary_loss_clip": 0.01592799, + "auxiliary_loss_mlp": 0.01473424, + "balance_loss_clip": 1.21967196, + "balance_loss_mlp": 1.08699501, + "epoch": 0.043889974447617615, + "flos": 24208799940000.0, + "grad_norm": 2.3650624377983926, + "language_loss": 0.95076233, + "learning_rate": 3.997976886700417e-06, + "loss": 0.98142457, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.7743287086486816 + }, + { + "auxiliary_loss_clip": 0.0159305, + "auxiliary_loss_mlp": 0.01469778, + "balance_loss_clip": 1.22096491, + "balance_loss_mlp": 1.08258593, + "epoch": 0.04395009770028559, + "flos": 17276449203360.0, + "grad_norm": 3.361727015643526, + "language_loss": 0.8819381, + "learning_rate": 3.997959335640013e-06, + "loss": 0.9125663, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.7090675830841064 + }, + { + "auxiliary_loss_clip": 0.01592054, + "auxiliary_loss_mlp": 0.01471165, + "balance_loss_clip": 1.21998644, + "balance_loss_mlp": 1.07596278, + "epoch": 0.04401022095295355, + "flos": 12311793152640.0, + "grad_norm": 5.504628474649134, + "language_loss": 0.89106894, + "learning_rate": 3.997941708816791e-06, + "loss": 0.92170107, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.737893581390381 + }, + { + "auxiliary_loss_clip": 0.01593585, + "auxiliary_loss_mlp": 0.01476222, + "balance_loss_clip": 1.22075856, + "balance_loss_mlp": 1.07968402, + "epoch": 0.044070344205621524, + "flos": 20961775666080.0, + "grad_norm": 2.3404695201592802, + "language_loss": 0.85777712, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88847518, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.7179622650146484 + }, + { + "auxiliary_loss_clip": 0.01597735, + "auxiliary_loss_mlp": 0.01502343, + "balance_loss_clip": 1.22546649, + "balance_loss_mlp": 1.09874761, + "epoch": 0.044130467458289496, + "flos": 13847026586400.0, + "grad_norm": 2.3610665310664896, + "language_loss": 0.91764963, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.94865036, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.7473552227020264 + }, + { + "auxiliary_loss_clip": 0.01601056, + "auxiliary_loss_mlp": 0.01483825, + "balance_loss_clip": 1.23028874, + "balance_loss_mlp": 1.07775092, + "epoch": 0.04419059071095746, + "flos": 28657438273440.0, + "grad_norm": 1.9902672234556251, + "language_loss": 0.78069139, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.81154025, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.7978391647338867 + }, + { + "auxiliary_loss_clip": 0.01590809, + "auxiliary_loss_mlp": 0.01495604, + "balance_loss_clip": 1.21773553, + "balance_loss_mlp": 1.09353471, + "epoch": 0.04425071396362543, + "flos": 28185796867680.0, + "grad_norm": 2.815795114818924, + "language_loss": 0.88381404, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.9146781, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 2.8453469276428223 + }, + { + "auxiliary_loss_clip": 0.01591851, + "auxiliary_loss_mlp": 0.01493513, + "balance_loss_clip": 1.22031331, + "balance_loss_mlp": 1.08343279, + "epoch": 0.0443108372162934, + "flos": 23660656706880.0, + "grad_norm": 1.8608834640140646, + "language_loss": 0.84404713, + "learning_rate": 3.997852438281901e-06, + "loss": 0.8749007, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 4.265499830245972 + }, + { + "auxiliary_loss_clip": 0.01595684, + "auxiliary_loss_mlp": 0.01486147, + "balance_loss_clip": 1.22312236, + "balance_loss_mlp": 1.08980048, + "epoch": 0.04437096046896137, + "flos": 33982415487360.0, + "grad_norm": 2.788379035305137, + "language_loss": 0.8521055, + "learning_rate": 3.997834356895906e-06, + "loss": 0.88292384, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 4.297913074493408 + }, + { + "auxiliary_loss_clip": 0.01672104, + "auxiliary_loss_mlp": 0.01682625, + "balance_loss_clip": 1.30320883, + "balance_loss_mlp": 1.25232697, + "epoch": 0.04443108372162934, + "flos": 67403791802880.0, + "grad_norm": 0.8925858587231493, + "language_loss": 0.59119415, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.62474144, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 4.827616930007935 + }, + { + "auxiliary_loss_clip": 0.01600451, + "auxiliary_loss_mlp": 0.01481401, + "balance_loss_clip": 1.22867298, + "balance_loss_mlp": 1.07704306, + "epoch": 0.04449120697429731, + "flos": 29755279794240.0, + "grad_norm": 2.93543849927053, + "language_loss": 0.9136734, + "learning_rate": 3.997797966850369e-06, + "loss": 0.94449192, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 4.403778076171875 + }, + { + "auxiliary_loss_clip": 0.01587306, + "auxiliary_loss_mlp": 0.01485795, + "balance_loss_clip": 1.21556413, + "balance_loss_mlp": 1.09612334, + "epoch": 0.04455133022696528, + "flos": 36505080475200.0, + "grad_norm": 2.8200552564701624, + "language_loss": 0.71850002, + "learning_rate": 3.997779658192205e-06, + "loss": 0.7492311, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.902454137802124 + }, + { + "auxiliary_loss_clip": 0.01586167, + "auxiliary_loss_mlp": 0.01474114, + "balance_loss_clip": 1.21267295, + "balance_loss_mlp": 1.09321594, + "epoch": 0.044611453479633245, + "flos": 28806193974240.0, + "grad_norm": 1.8445961080781412, + "language_loss": 0.88865733, + "learning_rate": 3.997761273778037e-06, + "loss": 0.91926008, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 2.7596895694732666 + }, + { + "auxiliary_loss_clip": 0.01584307, + "auxiliary_loss_mlp": 0.01473683, + "balance_loss_clip": 1.21133554, + "balance_loss_mlp": 1.09507489, + "epoch": 0.04467157673230122, + "flos": 20013031199520.0, + "grad_norm": 2.210701935553013, + "language_loss": 0.8404935, + "learning_rate": 3.997742813608561e-06, + "loss": 0.87107348, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 2.7567105293273926 + }, + { + "auxiliary_loss_clip": 0.01590648, + "auxiliary_loss_mlp": 0.01498668, + "balance_loss_clip": 1.21729875, + "balance_loss_mlp": 1.12425506, + "epoch": 0.04473169998496919, + "flos": 18006952851360.0, + "grad_norm": 2.321587956252452, + "language_loss": 0.8021152, + "learning_rate": 3.997724277684479e-06, + "loss": 0.83300841, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 2.800903797149658 + }, + { + "auxiliary_loss_clip": 0.01584067, + "auxiliary_loss_mlp": 0.01498707, + "balance_loss_clip": 1.21175253, + "balance_loss_mlp": 1.13440347, + "epoch": 0.044791823237637154, + "flos": 20633845515840.0, + "grad_norm": 2.6595802532742336, + "language_loss": 0.85629594, + "learning_rate": 3.99770566600649e-06, + "loss": 0.8871237, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.82902455329895 + }, + { + "auxiliary_loss_clip": 0.0157516, + "auxiliary_loss_mlp": 0.01492167, + "balance_loss_clip": 1.20169044, + "balance_loss_mlp": 1.12977028, + "epoch": 0.04485194649030513, + "flos": 31178889560160.0, + "grad_norm": 1.9532104613125068, + "language_loss": 0.69035649, + "learning_rate": 3.997686978575302e-06, + "loss": 0.72102976, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.888566732406616 + }, + { + "auxiliary_loss_clip": 0.01597032, + "auxiliary_loss_mlp": 0.01508035, + "balance_loss_clip": 1.22534966, + "balance_loss_mlp": 1.14602017, + "epoch": 0.04491206974297309, + "flos": 26145848308320.0, + "grad_norm": 2.0934932649473224, + "language_loss": 0.69096625, + "learning_rate": 3.997668215391625e-06, + "loss": 0.72201693, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 2.760067939758301 + }, + { + "auxiliary_loss_clip": 0.01588224, + "auxiliary_loss_mlp": 0.01493725, + "balance_loss_clip": 1.21535158, + "balance_loss_mlp": 1.13552475, + "epoch": 0.044972192995641064, + "flos": 20669763847680.0, + "grad_norm": 2.686665919311934, + "language_loss": 0.66915166, + "learning_rate": 3.997649376456168e-06, + "loss": 0.6999712, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 2.8020153045654297 + }, + { + "auxiliary_loss_clip": 0.01603293, + "auxiliary_loss_mlp": 0.01529536, + "balance_loss_clip": 1.23049068, + "balance_loss_mlp": 1.17667627, + "epoch": 0.045032316248309036, + "flos": 16108667426880.0, + "grad_norm": 4.29512023153504, + "language_loss": 0.76720107, + "learning_rate": 3.997630461769647e-06, + "loss": 0.79852933, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 2.7556381225585938 + }, + { + "auxiliary_loss_clip": 0.01587309, + "auxiliary_loss_mlp": 0.01501164, + "balance_loss_clip": 1.21479988, + "balance_loss_mlp": 1.13876796, + "epoch": 0.045092439500977, + "flos": 17860814193600.0, + "grad_norm": 3.3555690167738925, + "language_loss": 0.89062011, + "learning_rate": 3.997611471332778e-06, + "loss": 0.92150486, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.7949700355529785 + }, + { + "auxiliary_loss_clip": 0.01583167, + "auxiliary_loss_mlp": 0.01512282, + "balance_loss_clip": 1.20936918, + "balance_loss_mlp": 1.16056669, + "epoch": 0.04515256275364497, + "flos": 24465158923680.0, + "grad_norm": 1.9373343329519521, + "language_loss": 0.74899554, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77995002, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.77394437789917 + }, + { + "auxiliary_loss_clip": 0.01591536, + "auxiliary_loss_mlp": 0.01519675, + "balance_loss_clip": 1.21777654, + "balance_loss_mlp": 1.16185641, + "epoch": 0.04521268600631294, + "flos": 20918119989600.0, + "grad_norm": 2.4833690574671117, + "language_loss": 0.69431311, + "learning_rate": 3.997573263210883e-06, + "loss": 0.72542518, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 2.726755380630493 + }, + { + "auxiliary_loss_clip": 0.01588483, + "auxiliary_loss_mlp": 0.01482024, + "balance_loss_clip": 1.21447301, + "balance_loss_mlp": 1.11466908, + "epoch": 0.04527280925898091, + "flos": 13373109491040.0, + "grad_norm": 6.714439587342623, + "language_loss": 0.92380226, + "learning_rate": 3.997554045527305e-06, + "loss": 0.95450723, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.7544498443603516 + }, + { + "auxiliary_loss_clip": 0.01598905, + "auxiliary_loss_mlp": 0.01512976, + "balance_loss_clip": 1.22538662, + "balance_loss_mlp": 1.1480999, + "epoch": 0.04533293251164888, + "flos": 23256604010880.0, + "grad_norm": 2.1692696693906273, + "language_loss": 0.91123289, + "learning_rate": 3.997534752096277e-06, + "loss": 0.9423517, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 2.7158620357513428 + }, + { + "auxiliary_loss_clip": 0.01609233, + "auxiliary_loss_mlp": 0.01499271, + "balance_loss_clip": 1.23551977, + "balance_loss_mlp": 1.13344121, + "epoch": 0.04539305576431685, + "flos": 12423644389440.0, + "grad_norm": 2.255672881752274, + "language_loss": 0.78729248, + "learning_rate": 3.997515382918531e-06, + "loss": 0.81837749, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.90519642829895 + }, + { + "auxiliary_loss_clip": 0.01590896, + "auxiliary_loss_mlp": 0.01475599, + "balance_loss_clip": 1.21602476, + "balance_loss_mlp": 1.1069088, + "epoch": 0.04545317901698482, + "flos": 16072862879520.0, + "grad_norm": 2.71785753163779, + "language_loss": 0.78739238, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.81805742, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 2.7319893836975098 + }, + { + "auxiliary_loss_clip": 0.01716513, + "auxiliary_loss_mlp": 0.01483292, + "balance_loss_clip": 1.3413167, + "balance_loss_mlp": 1.09648132, + "epoch": 0.045513302269652785, + "flos": 66403502238240.0, + "grad_norm": 0.8232537358036071, + "language_loss": 0.62680686, + "learning_rate": 3.997476417325827e-06, + "loss": 0.65880489, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.357712984085083 + }, + { + "auxiliary_loss_clip": 0.01590342, + "auxiliary_loss_mlp": 0.01489415, + "balance_loss_clip": 1.21574235, + "balance_loss_mlp": 1.12530208, + "epoch": 0.04557342552232076, + "flos": 21473355788640.0, + "grad_norm": 2.246286750325206, + "language_loss": 0.84530354, + "learning_rate": 3.997456820912346e-06, + "loss": 0.87610108, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 2.7696800231933594 + }, + { + "auxiliary_loss_clip": 0.01587643, + "auxiliary_loss_mlp": 0.0147724, + "balance_loss_clip": 1.2130115, + "balance_loss_mlp": 1.10721469, + "epoch": 0.04563354877498873, + "flos": 23734958700960.0, + "grad_norm": 2.265036953760202, + "language_loss": 0.8863734, + "learning_rate": 3.997437148755101e-06, + "loss": 0.91702217, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.768505096435547 + }, + { + "auxiliary_loss_clip": 0.01586517, + "auxiliary_loss_mlp": 0.01477629, + "balance_loss_clip": 1.21090734, + "balance_loss_mlp": 1.09539676, + "epoch": 0.045693672027656694, + "flos": 25737926940000.0, + "grad_norm": 2.4795724569580124, + "language_loss": 0.74189496, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.7725364, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.8020217418670654 + }, + { + "auxiliary_loss_clip": 0.0159786, + "auxiliary_loss_mlp": 0.01481139, + "balance_loss_clip": 1.22109878, + "balance_loss_mlp": 1.10615396, + "epoch": 0.045753795280324666, + "flos": 19721057309280.0, + "grad_norm": 5.2686854900988, + "language_loss": 0.82748747, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.85827744, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.758539915084839 + }, + { + "auxiliary_loss_clip": 0.01600493, + "auxiliary_loss_mlp": 0.0149774, + "balance_loss_clip": 1.22674417, + "balance_loss_mlp": 1.13553476, + "epoch": 0.04581391853299264, + "flos": 23257893568320.0, + "grad_norm": 1.9378920153786516, + "language_loss": 0.79992163, + "learning_rate": 3.997377677828266e-06, + "loss": 0.83090389, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.8336567878723145 + }, + { + "auxiliary_loss_clip": 0.01705974, + "auxiliary_loss_mlp": 0.01440033, + "balance_loss_clip": 1.32716036, + "balance_loss_mlp": 1.06542969, + "epoch": 0.0458740417856606, + "flos": 64237706951040.0, + "grad_norm": 1.0095252464158453, + "language_loss": 0.58702934, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.61848938, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 3.3463988304138184 + }, + { + "auxiliary_loss_clip": 0.01589756, + "auxiliary_loss_mlp": 0.01503615, + "balance_loss_clip": 1.2129488, + "balance_loss_mlp": 1.13816714, + "epoch": 0.045934165038328575, + "flos": 20771829619200.0, + "grad_norm": 3.3174019951613913, + "language_loss": 0.88326186, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.91419554, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 2.7730140686035156 + }, + { + "auxiliary_loss_clip": 0.01589403, + "auxiliary_loss_mlp": 0.01498079, + "balance_loss_clip": 1.21474719, + "balance_loss_mlp": 1.13472939, + "epoch": 0.04599428829099654, + "flos": 30265494503040.0, + "grad_norm": 3.0040527271376956, + "language_loss": 0.8579855, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88886034, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.82035493850708 + }, + { + "auxiliary_loss_clip": 0.01591747, + "auxiliary_loss_mlp": 0.01483338, + "balance_loss_clip": 1.21520424, + "balance_loss_mlp": 1.11808074, + "epoch": 0.04605441154366451, + "flos": 23041131948000.0, + "grad_norm": 2.7317996994445495, + "language_loss": 0.88160229, + "learning_rate": 3.997297322892056e-06, + "loss": 0.91235316, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 2.828303098678589 + }, + { + "auxiliary_loss_clip": 0.01595197, + "auxiliary_loss_mlp": 0.01489108, + "balance_loss_clip": 1.21886337, + "balance_loss_mlp": 1.11965442, + "epoch": 0.046114534796332485, + "flos": 22019564685600.0, + "grad_norm": 2.91475363284369, + "language_loss": 0.84389347, + "learning_rate": 3.997277044811806e-06, + "loss": 0.87473655, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.821161985397339 + }, + { + "auxiliary_loss_clip": 0.01588257, + "auxiliary_loss_mlp": 0.01474724, + "balance_loss_clip": 1.21206057, + "balance_loss_mlp": 1.09859514, + "epoch": 0.04617465804900045, + "flos": 29865007054080.0, + "grad_norm": 2.031277493463063, + "language_loss": 0.87293899, + "learning_rate": 3.99725669099461e-06, + "loss": 0.90356874, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.812931776046753 + }, + { + "auxiliary_loss_clip": 0.0158512, + "auxiliary_loss_mlp": 0.01496739, + "balance_loss_clip": 1.20736217, + "balance_loss_mlp": 1.13262677, + "epoch": 0.04623478130166842, + "flos": 25632447634080.0, + "grad_norm": 2.5228017453318903, + "language_loss": 0.75532705, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.78614569, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 2.890606641769409 + }, + { + "auxiliary_loss_clip": 0.0159008, + "auxiliary_loss_mlp": 0.01492541, + "balance_loss_clip": 1.21318102, + "balance_loss_mlp": 1.13262439, + "epoch": 0.04629490455433639, + "flos": 20451181675680.0, + "grad_norm": 3.3195380614458334, + "language_loss": 0.86629075, + "learning_rate": 3.997215756152471e-06, + "loss": 0.8971169, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.7473437786102295 + }, + { + "auxiliary_loss_clip": 0.01585476, + "auxiliary_loss_mlp": 0.01507358, + "balance_loss_clip": 1.20702624, + "balance_loss_mlp": 1.14496207, + "epoch": 0.04635502780700436, + "flos": 23150972992320.0, + "grad_norm": 2.5147355338871953, + "language_loss": 0.8671447, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89807308, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.793691635131836 + }, + { + "auxiliary_loss_clip": 0.01587727, + "auxiliary_loss_mlp": 0.01507781, + "balance_loss_clip": 1.21124816, + "balance_loss_mlp": 1.14176106, + "epoch": 0.04641515105967233, + "flos": 23294115325440.0, + "grad_norm": 2.6133930510474492, + "language_loss": 0.83790565, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86886072, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.8559536933898926 + }, + { + "auxiliary_loss_clip": 0.01590625, + "auxiliary_loss_mlp": 0.01486078, + "balance_loss_clip": 1.21330595, + "balance_loss_mlp": 1.11910403, + "epoch": 0.046475274312340296, + "flos": 25117340192640.0, + "grad_norm": 2.0134780717150766, + "language_loss": 0.7391367, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76990372, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.8177852630615234 + }, + { + "auxiliary_loss_clip": 0.01596037, + "auxiliary_loss_mlp": 0.01504457, + "balance_loss_clip": 1.21729755, + "balance_loss_mlp": 1.13404965, + "epoch": 0.04653539756500827, + "flos": 25267271666400.0, + "grad_norm": 2.629909695189193, + "language_loss": 0.7857579, + "learning_rate": 3.997132977658996e-06, + "loss": 0.8167628, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.881319761276245 + }, + { + "auxiliary_loss_clip": 0.01582168, + "auxiliary_loss_mlp": 0.01491188, + "balance_loss_clip": 1.20275056, + "balance_loss_mlp": 1.12650263, + "epoch": 0.046595520817676234, + "flos": 35406859672800.0, + "grad_norm": 2.6157082440932933, + "language_loss": 0.73873305, + "learning_rate": 3.997112093704952e-06, + "loss": 0.7694667, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 4.34852409362793 + }, + { + "auxiliary_loss_clip": 0.0158504, + "auxiliary_loss_mlp": 0.01467475, + "balance_loss_clip": 1.20579195, + "balance_loss_mlp": 1.08352637, + "epoch": 0.046655644070344206, + "flos": 18114404421600.0, + "grad_norm": 2.2057251085727576, + "language_loss": 0.77286339, + "learning_rate": 3.997091134020217e-06, + "loss": 0.80338854, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 4.304578065872192 + }, + { + "auxiliary_loss_clip": 0.0158434, + "auxiliary_loss_mlp": 0.01496527, + "balance_loss_clip": 1.20494747, + "balance_loss_mlp": 1.12611938, + "epoch": 0.04671576732301218, + "flos": 29207857196160.0, + "grad_norm": 1.9278590526860155, + "language_loss": 0.71123284, + "learning_rate": 3.997070098605585e-06, + "loss": 0.74204147, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 4.257874250411987 + }, + { + "auxiliary_loss_clip": 0.01583762, + "auxiliary_loss_mlp": 0.0150381, + "balance_loss_clip": 1.20312619, + "balance_loss_mlp": 1.12920642, + "epoch": 0.04677589057568014, + "flos": 30480738996960.0, + "grad_norm": 2.3872020355817836, + "language_loss": 0.76520443, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79608011, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 4.365997314453125 + }, + { + "auxiliary_loss_clip": 0.01584957, + "auxiliary_loss_mlp": 0.01468799, + "balance_loss_clip": 1.2064507, + "balance_loss_mlp": 1.09724736, + "epoch": 0.046836013828348115, + "flos": 20559733162560.0, + "grad_norm": 2.3069961431865744, + "language_loss": 0.79092813, + "learning_rate": 3.997027800589829e-06, + "loss": 0.82146573, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 2.8490102291107178 + }, + { + "auxiliary_loss_clip": 0.01588878, + "auxiliary_loss_mlp": 0.01483611, + "balance_loss_clip": 1.20901918, + "balance_loss_mlp": 1.11205959, + "epoch": 0.04689613708101608, + "flos": 25449745865760.0, + "grad_norm": 1.9322984491037076, + "language_loss": 0.77466547, + "learning_rate": 3.997006537990308e-06, + "loss": 0.80539036, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 2.8705828189849854 + }, + { + "auxiliary_loss_clip": 0.01604675, + "auxiliary_loss_mlp": 0.01492219, + "balance_loss_clip": 1.22333288, + "balance_loss_mlp": 1.11380088, + "epoch": 0.04695626033368405, + "flos": 23003544777120.0, + "grad_norm": 1.8057780921569595, + "language_loss": 0.76501489, + "learning_rate": 3.996985199664099e-06, + "loss": 0.79598391, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 2.850186347961426 + }, + { + "auxiliary_loss_clip": 0.01588052, + "auxiliary_loss_mlp": 0.01480913, + "balance_loss_clip": 1.20726061, + "balance_loss_mlp": 1.10936189, + "epoch": 0.047016383586352024, + "flos": 29135982604320.0, + "grad_norm": 3.020642239784877, + "language_loss": 0.74109423, + "learning_rate": 3.99696378561201e-06, + "loss": 0.77178383, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 2.852313756942749 + }, + { + "auxiliary_loss_clip": 0.01589281, + "auxiliary_loss_mlp": 0.0147868, + "balance_loss_clip": 1.20822144, + "balance_loss_mlp": 1.1004529, + "epoch": 0.04707650683901999, + "flos": 14978131467840.0, + "grad_norm": 2.2707697244988543, + "language_loss": 0.80686867, + "learning_rate": 3.996942295834855e-06, + "loss": 0.83754832, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 2.7343199253082275 + }, + { + "auxiliary_loss_clip": 0.01589575, + "auxiliary_loss_mlp": 0.01474209, + "balance_loss_clip": 1.20793235, + "balance_loss_mlp": 1.09159517, + "epoch": 0.04713663009168796, + "flos": 21653175016800.0, + "grad_norm": 2.1163086453121536, + "language_loss": 0.81567281, + "learning_rate": 3.996920730333448e-06, + "loss": 0.84631073, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.7849128246307373 + }, + { + "auxiliary_loss_clip": 0.01587716, + "auxiliary_loss_mlp": 0.01475097, + "balance_loss_clip": 1.20554733, + "balance_loss_mlp": 1.09648824, + "epoch": 0.04719675334435593, + "flos": 21327596412480.0, + "grad_norm": 2.340752249688011, + "language_loss": 0.80662155, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83724964, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 2.7901387214660645 + }, + { + "auxiliary_loss_clip": 0.01593484, + "auxiliary_loss_mlp": 0.01466579, + "balance_loss_clip": 1.21193969, + "balance_loss_mlp": 1.08873296, + "epoch": 0.0472568765970239, + "flos": 17933333564160.0, + "grad_norm": 1.9988765894867118, + "language_loss": 0.89648688, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92708749, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.812649965286255 + }, + { + "auxiliary_loss_clip": 0.01587685, + "auxiliary_loss_mlp": 0.0148172, + "balance_loss_clip": 1.20403624, + "balance_loss_mlp": 1.09910631, + "epoch": 0.04731699984969187, + "flos": 18079282581120.0, + "grad_norm": 3.9235558161478457, + "language_loss": 0.76931, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.80000407, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.7543041706085205 + }, + { + "auxiliary_loss_clip": 0.01588359, + "auxiliary_loss_mlp": 0.01470132, + "balance_loss_clip": 1.20739841, + "balance_loss_mlp": 1.09629178, + "epoch": 0.047377123102359836, + "flos": 23187308533920.0, + "grad_norm": 3.7716784038942412, + "language_loss": 0.81216156, + "learning_rate": 3.996833711101698e-06, + "loss": 0.8427465, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.800427198410034 + }, + { + "auxiliary_loss_clip": 0.01589339, + "auxiliary_loss_mlp": 0.01469809, + "balance_loss_clip": 1.20779157, + "balance_loss_mlp": 1.08280754, + "epoch": 0.04743724635502781, + "flos": 22750030405440.0, + "grad_norm": 2.0477231151484943, + "language_loss": 0.8512398, + "learning_rate": 3.996811766991355e-06, + "loss": 0.88183123, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.7519569396972656 + }, + { + "auxiliary_loss_clip": 0.01601379, + "auxiliary_loss_mlp": 0.01472593, + "balance_loss_clip": 1.21937919, + "balance_loss_mlp": 1.09016967, + "epoch": 0.04749736960769577, + "flos": 17240644656000.0, + "grad_norm": 2.1803194424376464, + "language_loss": 0.82180762, + "learning_rate": 3.996789747161709e-06, + "loss": 0.85254735, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.7571158409118652 + }, + { + "auxiliary_loss_clip": 0.0158801, + "auxiliary_loss_mlp": 0.01481472, + "balance_loss_clip": 1.20594692, + "balance_loss_mlp": 1.10438967, + "epoch": 0.047557492860363745, + "flos": 40482456684480.0, + "grad_norm": 2.500216022457942, + "language_loss": 0.88370121, + "learning_rate": 3.996767651613597e-06, + "loss": 0.91439605, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.863459587097168 + }, + { + "auxiliary_loss_clip": 0.01591257, + "auxiliary_loss_mlp": 0.0147406, + "balance_loss_clip": 1.20972705, + "balance_loss_mlp": 1.08934784, + "epoch": 0.04761761611303172, + "flos": 18700021041120.0, + "grad_norm": 4.053218262582627, + "language_loss": 0.90708888, + "learning_rate": 3.996745480347854e-06, + "loss": 0.93774199, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.800368309020996 + }, + { + "auxiliary_loss_clip": 0.01588809, + "auxiliary_loss_mlp": 0.01482322, + "balance_loss_clip": 1.20561576, + "balance_loss_mlp": 1.10104311, + "epoch": 0.04767773936569968, + "flos": 20924036782560.0, + "grad_norm": 2.197030605154354, + "language_loss": 0.73812562, + "learning_rate": 3.996723233365324e-06, + "loss": 0.76883698, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 2.72638201713562 + }, + { + "auxiliary_loss_clip": 0.01586098, + "auxiliary_loss_mlp": 0.01473587, + "balance_loss_clip": 1.2039361, + "balance_loss_mlp": 1.08830273, + "epoch": 0.047737862618367655, + "flos": 23734958700960.0, + "grad_norm": 2.8801239936856486, + "language_loss": 0.86156487, + "learning_rate": 3.996700910666847e-06, + "loss": 0.89216179, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 2.7545714378356934 + }, + { + "auxiliary_loss_clip": 0.01584747, + "auxiliary_loss_mlp": 0.01481958, + "balance_loss_clip": 1.20396614, + "balance_loss_mlp": 1.09896243, + "epoch": 0.04779798587103562, + "flos": 23698016308800.0, + "grad_norm": 5.070649874666961, + "language_loss": 0.69489896, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72556603, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 2.777031898498535 + }, + { + "auxiliary_loss_clip": 0.01599324, + "auxiliary_loss_mlp": 0.01478463, + "balance_loss_clip": 1.21613812, + "balance_loss_mlp": 1.10157132, + "epoch": 0.04785810912370359, + "flos": 23185791407520.0, + "grad_norm": 2.1737640621793433, + "language_loss": 0.80913216, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83991003, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 2.727217435836792 + }, + { + "auxiliary_loss_clip": 0.01595086, + "auxiliary_loss_mlp": 0.01485971, + "balance_loss_clip": 1.21355677, + "balance_loss_mlp": 1.12052298, + "epoch": 0.047918232376371564, + "flos": 18042567757920.0, + "grad_norm": 4.105358286043683, + "language_loss": 0.81756574, + "learning_rate": 3.996633488284228e-06, + "loss": 0.84837633, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 2.783985137939453 + }, + { + "auxiliary_loss_clip": 0.01698615, + "auxiliary_loss_mlp": 0.01425529, + "balance_loss_clip": 1.30735469, + "balance_loss_mlp": 1.0875473, + "epoch": 0.04797835562903953, + "flos": 62448997073760.0, + "grad_norm": 0.9202581268779897, + "language_loss": 0.64435643, + "learning_rate": 3.996610862730465e-06, + "loss": 0.67559791, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 3.261061906814575 + }, + { + "auxiliary_loss_clip": 0.01588287, + "auxiliary_loss_mlp": 0.01477105, + "balance_loss_clip": 1.20700073, + "balance_loss_mlp": 1.11242032, + "epoch": 0.0480384788817075, + "flos": 21509463761280.0, + "grad_norm": 2.4454846298656365, + "language_loss": 0.91249985, + "learning_rate": 3.996588161465018e-06, + "loss": 0.94315386, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 2.987316846847534 + }, + { + "auxiliary_loss_clip": 0.01597631, + "auxiliary_loss_mlp": 0.01486188, + "balance_loss_clip": 1.21426809, + "balance_loss_mlp": 1.11291957, + "epoch": 0.048098602134375466, + "flos": 21728918280960.0, + "grad_norm": 2.592066336226436, + "language_loss": 0.86957997, + "learning_rate": 3.996565384488748e-06, + "loss": 0.90041816, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.8059606552124023 + }, + { + "auxiliary_loss_clip": 0.01590647, + "auxiliary_loss_mlp": 0.01502838, + "balance_loss_clip": 1.20862675, + "balance_loss_mlp": 1.13853407, + "epoch": 0.04815872538704344, + "flos": 22933376952480.0, + "grad_norm": 2.447607325805084, + "language_loss": 0.84761953, + "learning_rate": 3.996542531802518e-06, + "loss": 0.87855434, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.767613410949707 + }, + { + "auxiliary_loss_clip": 0.01599781, + "auxiliary_loss_mlp": 0.01488577, + "balance_loss_clip": 1.21743202, + "balance_loss_mlp": 1.12446427, + "epoch": 0.04821884863971141, + "flos": 43177810406400.0, + "grad_norm": 2.0735948556182517, + "language_loss": 0.79957348, + "learning_rate": 3.996519603407196e-06, + "loss": 0.83045697, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 3.0252323150634766 + }, + { + "auxiliary_loss_clip": 0.0160007, + "auxiliary_loss_mlp": 0.0148585, + "balance_loss_clip": 1.21810961, + "balance_loss_mlp": 1.1211648, + "epoch": 0.048278971892379376, + "flos": 18621736590240.0, + "grad_norm": 5.582353775979843, + "language_loss": 0.86681437, + "learning_rate": 3.996496599303649e-06, + "loss": 0.89767361, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 2.6870529651641846 + }, + { + "auxiliary_loss_clip": 0.01605545, + "auxiliary_loss_mlp": 0.01507818, + "balance_loss_clip": 1.22432244, + "balance_loss_mlp": 1.14980888, + "epoch": 0.04833909514504735, + "flos": 20232144365760.0, + "grad_norm": 3.3003683106170256, + "language_loss": 0.85533947, + "learning_rate": 3.996473519492753e-06, + "loss": 0.88647306, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 2.774773359298706 + }, + { + "auxiliary_loss_clip": 0.01601355, + "auxiliary_loss_mlp": 0.01493827, + "balance_loss_clip": 1.21877599, + "balance_loss_mlp": 1.12723446, + "epoch": 0.04839921839771532, + "flos": 24647519338560.0, + "grad_norm": 2.0832646959911494, + "language_loss": 0.8649677, + "learning_rate": 3.99645036397538e-06, + "loss": 0.89591956, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.782658338546753 + }, + { + "auxiliary_loss_clip": 0.01590098, + "auxiliary_loss_mlp": 0.01491764, + "balance_loss_clip": 1.20910883, + "balance_loss_mlp": 1.12574387, + "epoch": 0.048459341650383285, + "flos": 24829803897120.0, + "grad_norm": 2.172233064625165, + "language_loss": 0.68245786, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.71327651, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.8411993980407715 + }, + { + "auxiliary_loss_clip": 0.01595079, + "auxiliary_loss_mlp": 0.014942, + "balance_loss_clip": 1.21232724, + "balance_loss_mlp": 1.12646341, + "epoch": 0.04851946490305126, + "flos": 22165361989920.0, + "grad_norm": 3.2240626818707736, + "language_loss": 0.77373552, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.80462837, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.7758774757385254 + }, + { + "auxiliary_loss_clip": 0.01597504, + "auxiliary_loss_mlp": 0.01472366, + "balance_loss_clip": 1.21454024, + "balance_loss_mlp": 1.09509206, + "epoch": 0.04857958815571922, + "flos": 19794069745920.0, + "grad_norm": 6.140019372783446, + "language_loss": 0.87086016, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.90155888, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 2.7571284770965576 + }, + { + "auxiliary_loss_clip": 0.01595168, + "auxiliary_loss_mlp": 0.01493279, + "balance_loss_clip": 1.21378994, + "balance_loss_mlp": 1.12821245, + "epoch": 0.048639711408387194, + "flos": 18699983112960.0, + "grad_norm": 2.656246230628043, + "language_loss": 0.90221125, + "learning_rate": 3.996356984858732e-06, + "loss": 0.93309569, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.7715253829956055 + }, + { + "auxiliary_loss_clip": 0.0160471, + "auxiliary_loss_mlp": 0.01481987, + "balance_loss_clip": 1.22364295, + "balance_loss_mlp": 1.10986352, + "epoch": 0.048699834661055166, + "flos": 24865836013440.0, + "grad_norm": 2.4988868240982445, + "language_loss": 0.8502242, + "learning_rate": 3.996333450822208e-06, + "loss": 0.88109118, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 2.79219651222229 + }, + { + "auxiliary_loss_clip": 0.01601857, + "auxiliary_loss_mlp": 0.01484154, + "balance_loss_clip": 1.22176075, + "balance_loss_mlp": 1.11336493, + "epoch": 0.04875995791372313, + "flos": 20706099389280.0, + "grad_norm": 2.2732504250056467, + "language_loss": 0.80625075, + "learning_rate": 3.99630984108452e-06, + "loss": 0.83711088, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.7977683544158936 + }, + { + "auxiliary_loss_clip": 0.01604524, + "auxiliary_loss_mlp": 0.01483945, + "balance_loss_clip": 1.22371507, + "balance_loss_mlp": 1.10972357, + "epoch": 0.048820081166391104, + "flos": 18590293781280.0, + "grad_norm": 1.8000227467040977, + "language_loss": 0.74661821, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77750289, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.744022846221924 + }, + { + "auxiliary_loss_clip": 0.01613018, + "auxiliary_loss_mlp": 0.01489103, + "balance_loss_clip": 1.2327987, + "balance_loss_mlp": 1.1179328, + "epoch": 0.04888020441905907, + "flos": 22709333125440.0, + "grad_norm": 2.197170781151613, + "language_loss": 0.90531826, + "learning_rate": 3.996262394509233e-06, + "loss": 0.93633944, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.7667765617370605 + }, + { + "auxiliary_loss_clip": 0.01601902, + "auxiliary_loss_mlp": 0.01482247, + "balance_loss_clip": 1.22068679, + "balance_loss_mlp": 1.1064992, + "epoch": 0.04894032767172704, + "flos": 22786593516000.0, + "grad_norm": 2.099409825053945, + "language_loss": 0.75100982, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.78185135, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 5.809283018112183 + }, + { + "auxiliary_loss_clip": 0.01603124, + "auxiliary_loss_mlp": 0.01491797, + "balance_loss_clip": 1.22321939, + "balance_loss_mlp": 1.1206274, + "epoch": 0.04900045092439501, + "flos": 25518282779520.0, + "grad_norm": 2.1131307248450257, + "language_loss": 0.83132958, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86227876, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 4.216533184051514 + }, + { + "auxiliary_loss_clip": 0.01602436, + "auxiliary_loss_mlp": 0.01475945, + "balance_loss_clip": 1.22141147, + "balance_loss_mlp": 1.10630083, + "epoch": 0.04906057417706298, + "flos": 25960529496960.0, + "grad_norm": 2.245244109820368, + "language_loss": 0.91005599, + "learning_rate": 3.996190656910043e-06, + "loss": 0.94083977, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 2.7677555084228516 + }, + { + "auxiliary_loss_clip": 0.01599124, + "auxiliary_loss_mlp": 0.01471094, + "balance_loss_clip": 1.21914124, + "balance_loss_mlp": 1.09515536, + "epoch": 0.04912069742973095, + "flos": 18626629322880.0, + "grad_norm": 2.5326072418077485, + "language_loss": 0.80129081, + "learning_rate": 3.996166592984268e-06, + "loss": 0.83199298, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 4.2608795166015625 + }, + { + "auxiliary_loss_clip": 0.01609199, + "auxiliary_loss_mlp": 0.01497257, + "balance_loss_clip": 1.22752488, + "balance_loss_mlp": 1.12189126, + "epoch": 0.049180820682398915, + "flos": 23702264262720.0, + "grad_norm": 2.254048209469861, + "language_loss": 0.85019577, + "learning_rate": 3.996142453363656e-06, + "loss": 0.88126028, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 2.8229494094848633 + }, + { + "auxiliary_loss_clip": 0.01601721, + "auxiliary_loss_mlp": 0.01471949, + "balance_loss_clip": 1.21966863, + "balance_loss_mlp": 1.09257698, + "epoch": 0.04924094393506689, + "flos": 22422669177600.0, + "grad_norm": 2.083967234494482, + "language_loss": 0.7536459, + "learning_rate": 3.996118238049124e-06, + "loss": 0.78438258, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 2.8195865154266357 + }, + { + "auxiliary_loss_clip": 0.01605912, + "auxiliary_loss_mlp": 0.01495691, + "balance_loss_clip": 1.22663188, + "balance_loss_mlp": 1.11879897, + "epoch": 0.04930106718773486, + "flos": 15739850355840.0, + "grad_norm": 2.52862448130024, + "language_loss": 0.84716713, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87818319, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.770303726196289 + }, + { + "auxiliary_loss_clip": 0.01601785, + "auxiliary_loss_mlp": 0.01473671, + "balance_loss_clip": 1.21963382, + "balance_loss_mlp": 1.09563494, + "epoch": 0.049361190440402825, + "flos": 26253072309600.0, + "grad_norm": 2.229333602878343, + "language_loss": 0.90805489, + "learning_rate": 3.996069580341966e-06, + "loss": 0.93880951, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 2.796457529067993 + }, + { + "auxiliary_loss_clip": 0.01611591, + "auxiliary_loss_mlp": 0.01483207, + "balance_loss_clip": 1.23122001, + "balance_loss_mlp": 1.10707736, + "epoch": 0.0494213136930708, + "flos": 21254546047680.0, + "grad_norm": 2.397404691616607, + "language_loss": 0.89937258, + "learning_rate": 3.996045137951188e-06, + "loss": 0.93032056, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.793041467666626 + }, + { + "auxiliary_loss_clip": 0.01599465, + "auxiliary_loss_mlp": 0.01481142, + "balance_loss_clip": 1.21878624, + "balance_loss_mlp": 1.10501337, + "epoch": 0.04948143694573876, + "flos": 27968049115200.0, + "grad_norm": 2.227035236533098, + "language_loss": 0.67481959, + "learning_rate": 3.996020619870178e-06, + "loss": 0.70562565, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.8804123401641846 + }, + { + "auxiliary_loss_clip": 0.01764243, + "auxiliary_loss_mlp": 0.01419876, + "balance_loss_clip": 1.37453592, + "balance_loss_mlp": 1.07426453, + "epoch": 0.049541560198406734, + "flos": 66186513413280.0, + "grad_norm": 1.4275234226447124, + "language_loss": 0.62259281, + "learning_rate": 3.995996026099866e-06, + "loss": 0.65443397, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.3886635303497314 + }, + { + "auxiliary_loss_clip": 0.01604705, + "auxiliary_loss_mlp": 0.01483109, + "balance_loss_clip": 1.22480679, + "balance_loss_mlp": 1.11422753, + "epoch": 0.049601683451074706, + "flos": 22894879505760.0, + "grad_norm": 1.9646270945471846, + "language_loss": 0.90894872, + "learning_rate": 3.995971356641185e-06, + "loss": 0.93982685, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.780670404434204 + }, + { + "auxiliary_loss_clip": 0.01600444, + "auxiliary_loss_mlp": 0.01485278, + "balance_loss_clip": 1.22146475, + "balance_loss_mlp": 1.10933959, + "epoch": 0.04966180670374267, + "flos": 21435768617760.0, + "grad_norm": 5.79159034461491, + "language_loss": 0.67156011, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.70241737, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 2.7315330505371094 + }, + { + "auxiliary_loss_clip": 0.01606467, + "auxiliary_loss_mlp": 0.01473779, + "balance_loss_clip": 1.22562265, + "balance_loss_mlp": 1.10012889, + "epoch": 0.04972192995641064, + "flos": 23109327508320.0, + "grad_norm": 1.8951522531647276, + "language_loss": 0.78272557, + "learning_rate": 3.995921790662459e-06, + "loss": 0.813528, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.813575029373169 + }, + { + "auxiliary_loss_clip": 0.01602391, + "auxiliary_loss_mlp": 0.01490518, + "balance_loss_clip": 1.2235136, + "balance_loss_mlp": 1.11953831, + "epoch": 0.04978205320907861, + "flos": 40409330463360.0, + "grad_norm": 1.9791980385509902, + "language_loss": 0.78934693, + "learning_rate": 3.995896894144294e-06, + "loss": 0.82027602, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.956491470336914 + }, + { + "auxiliary_loss_clip": 0.01603857, + "auxiliary_loss_mlp": 0.01475817, + "balance_loss_clip": 1.22459507, + "balance_loss_mlp": 1.10693598, + "epoch": 0.04984217646174658, + "flos": 25230898196640.0, + "grad_norm": 2.1814099828880664, + "language_loss": 0.83806932, + "learning_rate": 3.995871921941519e-06, + "loss": 0.86886597, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.8270435333251953 + }, + { + "auxiliary_loss_clip": 0.01605249, + "auxiliary_loss_mlp": 0.01492489, + "balance_loss_clip": 1.22501159, + "balance_loss_mlp": 1.12532485, + "epoch": 0.04990229971441455, + "flos": 15961390924320.0, + "grad_norm": 2.3458306068631227, + "language_loss": 0.75546587, + "learning_rate": 3.99584687405508e-06, + "loss": 0.78644323, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.7942752838134766 + }, + { + "auxiliary_loss_clip": 0.01599254, + "auxiliary_loss_mlp": 0.01487046, + "balance_loss_clip": 1.2178303, + "balance_loss_mlp": 1.12102532, + "epoch": 0.04996242296708252, + "flos": 18407061018720.0, + "grad_norm": 4.762219931577293, + "language_loss": 0.7954793, + "learning_rate": 3.995821750485929e-06, + "loss": 0.82634228, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.8453850746154785 + }, + { + "auxiliary_loss_clip": 0.01610991, + "auxiliary_loss_mlp": 0.01480843, + "balance_loss_clip": 1.23125958, + "balance_loss_mlp": 1.11520386, + "epoch": 0.05002254621975049, + "flos": 17859979774080.0, + "grad_norm": 2.6997513151616053, + "language_loss": 0.91541409, + "learning_rate": 3.995796551235016e-06, + "loss": 0.94633245, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.811673879623413 + }, + { + "auxiliary_loss_clip": 0.01609167, + "auxiliary_loss_mlp": 0.01492036, + "balance_loss_clip": 1.22998953, + "balance_loss_mlp": 1.12620616, + "epoch": 0.050082669472418455, + "flos": 45663760571040.0, + "grad_norm": 2.1090621497697866, + "language_loss": 0.8360942, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.8671062, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 2.955228090286255 + }, + { + "auxiliary_loss_clip": 0.01601744, + "auxiliary_loss_mlp": 0.01485464, + "balance_loss_clip": 1.22287035, + "balance_loss_mlp": 1.11658287, + "epoch": 0.05014279272508643, + "flos": 37965139567200.0, + "grad_norm": 4.704756523067834, + "language_loss": 0.82279813, + "learning_rate": 3.995745925691733e-06, + "loss": 0.85367024, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.9222230911254883 + }, + { + "auxiliary_loss_clip": 0.01600743, + "auxiliary_loss_mlp": 0.01487717, + "balance_loss_clip": 1.21963012, + "balance_loss_mlp": 1.12532079, + "epoch": 0.0502029159777544, + "flos": 20998300848480.0, + "grad_norm": 2.8410822983278243, + "language_loss": 0.91903925, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94992387, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 2.752405881881714 + }, + { + "auxiliary_loss_clip": 0.01590633, + "auxiliary_loss_mlp": 0.01483152, + "balance_loss_clip": 1.21119022, + "balance_loss_mlp": 1.11388969, + "epoch": 0.050263039230422364, + "flos": 15889895614080.0, + "grad_norm": 2.460093002305328, + "language_loss": 0.76726121, + "learning_rate": 3.995694997432911e-06, + "loss": 0.79799902, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 2.7608394622802734 + }, + { + "auxiliary_loss_clip": 0.0160453, + "auxiliary_loss_mlp": 0.01485596, + "balance_loss_clip": 1.22404933, + "balance_loss_mlp": 1.12052917, + "epoch": 0.050323162483090336, + "flos": 23735034557280.0, + "grad_norm": 2.3438412081089255, + "language_loss": 0.83844161, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.8693428, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 2.8023030757904053 + }, + { + "auxiliary_loss_clip": 0.01616172, + "auxiliary_loss_mlp": 0.01486006, + "balance_loss_clip": 1.23692393, + "balance_loss_mlp": 1.12170279, + "epoch": 0.0503832857357583, + "flos": 20268290266560.0, + "grad_norm": 2.4349380304366144, + "language_loss": 0.72990942, + "learning_rate": 3.995643766466275e-06, + "loss": 0.76093119, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.74694561958313 + }, + { + "auxiliary_loss_clip": 0.01602399, + "auxiliary_loss_mlp": 0.01490709, + "balance_loss_clip": 1.22201037, + "balance_loss_mlp": 1.12735915, + "epoch": 0.05044340898842627, + "flos": 17786929409280.0, + "grad_norm": 1.917123184916067, + "language_loss": 0.83537155, + "learning_rate": 3.995618037469953e-06, + "loss": 0.86630267, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.7834432125091553 + }, + { + "auxiliary_loss_clip": 0.01601178, + "auxiliary_loss_mlp": 0.01479213, + "balance_loss_clip": 1.22233999, + "balance_loss_mlp": 1.11147618, + "epoch": 0.050503532241094246, + "flos": 22968764290080.0, + "grad_norm": 2.2704070954781654, + "language_loss": 0.85832393, + "learning_rate": 3.995592232799595e-06, + "loss": 0.88912785, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 2.7260985374450684 + }, + { + "auxiliary_loss_clip": 0.015911, + "auxiliary_loss_mlp": 0.01479968, + "balance_loss_clip": 1.21115232, + "balance_loss_mlp": 1.10307622, + "epoch": 0.05056365549376221, + "flos": 22778818243200.0, + "grad_norm": 1.9451898605226372, + "language_loss": 0.94535255, + "learning_rate": 3.99556635245618e-06, + "loss": 0.97606325, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.82609224319458 + }, + { + "auxiliary_loss_clip": 0.01602659, + "auxiliary_loss_mlp": 0.01482499, + "balance_loss_clip": 1.22143662, + "balance_loss_mlp": 1.10865855, + "epoch": 0.05062377874643018, + "flos": 30919458395520.0, + "grad_norm": 2.4250866705315293, + "language_loss": 0.77567047, + "learning_rate": 3.995540396440688e-06, + "loss": 0.80652201, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 2.8628182411193848 + }, + { + "auxiliary_loss_clip": 0.01609519, + "auxiliary_loss_mlp": 0.01490663, + "balance_loss_clip": 1.22857869, + "balance_loss_mlp": 1.12044692, + "epoch": 0.05068390199909815, + "flos": 19649258573760.0, + "grad_norm": 3.790797671216452, + "language_loss": 0.78308934, + "learning_rate": 3.995514364754105e-06, + "loss": 0.81409121, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 2.827629804611206 + }, + { + "auxiliary_loss_clip": 0.0160026, + "auxiliary_loss_mlp": 0.01474155, + "balance_loss_clip": 1.21984267, + "balance_loss_mlp": 1.1014595, + "epoch": 0.05074402525176612, + "flos": 37965177495360.0, + "grad_norm": 2.983263060651386, + "language_loss": 0.83001262, + "learning_rate": 3.995488257397417e-06, + "loss": 0.86075681, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.8622467517852783 + }, + { + "auxiliary_loss_clip": 0.01592416, + "auxiliary_loss_mlp": 0.01469795, + "balance_loss_clip": 1.21152532, + "balance_loss_mlp": 1.10263062, + "epoch": 0.05080414850443409, + "flos": 22056810503040.0, + "grad_norm": 2.9846099907825647, + "language_loss": 0.76555955, + "learning_rate": 3.995462074371614e-06, + "loss": 0.79618162, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 2.7889273166656494 + }, + { + "auxiliary_loss_clip": 0.01600171, + "auxiliary_loss_mlp": 0.01481009, + "balance_loss_clip": 1.21892715, + "balance_loss_mlp": 1.10735941, + "epoch": 0.05086427175710206, + "flos": 20227630914720.0, + "grad_norm": 2.0155230958277595, + "language_loss": 0.87688828, + "learning_rate": 3.99543581567769e-06, + "loss": 0.90770012, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.813591480255127 + }, + { + "auxiliary_loss_clip": 0.01603089, + "auxiliary_loss_mlp": 0.01479402, + "balance_loss_clip": 1.22357607, + "balance_loss_mlp": 1.1099484, + "epoch": 0.05092439500977003, + "flos": 15161060805120.0, + "grad_norm": 9.053877273049808, + "language_loss": 0.87694651, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.90777147, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 2.7500383853912354 + }, + { + "auxiliary_loss_clip": 0.01601268, + "auxiliary_loss_mlp": 0.01485621, + "balance_loss_clip": 1.22162104, + "balance_loss_mlp": 1.1199826, + "epoch": 0.050984518262437994, + "flos": 22057379425440.0, + "grad_norm": 2.527943749539725, + "language_loss": 0.82171285, + "learning_rate": 3.995383071289462e-06, + "loss": 0.8525818, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.778815507888794 + }, + { + "auxiliary_loss_clip": 0.01608658, + "auxiliary_loss_mlp": 0.01492479, + "balance_loss_clip": 1.23033869, + "balance_loss_mlp": 1.13961911, + "epoch": 0.05104464151510597, + "flos": 30227869404000.0, + "grad_norm": 1.9413275947759456, + "language_loss": 0.87384868, + "learning_rate": 3.995356585597158e-06, + "loss": 0.90486008, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 2.846540927886963 + }, + { + "auxiliary_loss_clip": 0.0159681, + "auxiliary_loss_mlp": 0.01471973, + "balance_loss_clip": 1.21657705, + "balance_loss_mlp": 1.10194755, + "epoch": 0.05110476476777394, + "flos": 18334883001600.0, + "grad_norm": 5.569379243674454, + "language_loss": 0.83756042, + "learning_rate": 3.995330024240732e-06, + "loss": 0.86824822, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.814908266067505 + }, + { + "auxiliary_loss_clip": 0.01599913, + "auxiliary_loss_mlp": 0.0147878, + "balance_loss_clip": 1.21827388, + "balance_loss_mlp": 1.11676526, + "epoch": 0.051164888020441904, + "flos": 38001816462240.0, + "grad_norm": 2.904361115270721, + "language_loss": 0.65388864, + "learning_rate": 3.995303387221192e-06, + "loss": 0.68467557, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.9206433296203613 + }, + { + "auxiliary_loss_clip": 0.01595996, + "auxiliary_loss_mlp": 0.01481941, + "balance_loss_clip": 1.21443105, + "balance_loss_mlp": 1.11820912, + "epoch": 0.051225011273109876, + "flos": 23040828522720.0, + "grad_norm": 5.158185343678895, + "language_loss": 0.83525074, + "learning_rate": 3.995276674539547e-06, + "loss": 0.8660301, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 4.323344945907593 + }, + { + "auxiliary_loss_clip": 0.0160305, + "auxiliary_loss_mlp": 0.01513458, + "balance_loss_clip": 1.22222638, + "balance_loss_mlp": 1.15087152, + "epoch": 0.05128513452577785, + "flos": 18261794708640.0, + "grad_norm": 2.1426094412545047, + "language_loss": 0.80482799, + "learning_rate": 3.995249886196811e-06, + "loss": 0.83599305, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 4.342184782028198 + }, + { + "auxiliary_loss_clip": 0.01595421, + "auxiliary_loss_mlp": 0.01481573, + "balance_loss_clip": 1.21485174, + "balance_loss_mlp": 1.11211967, + "epoch": 0.05134525777844581, + "flos": 27201437494560.0, + "grad_norm": 2.0742072529739843, + "language_loss": 0.75743663, + "learning_rate": 3.995223022193999e-06, + "loss": 0.78820658, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.790907144546509 + }, + { + "auxiliary_loss_clip": 0.01606985, + "auxiliary_loss_mlp": 0.01471711, + "balance_loss_clip": 1.22678018, + "balance_loss_mlp": 1.10225797, + "epoch": 0.051405381031113785, + "flos": 28364554107360.0, + "grad_norm": 2.11627660467045, + "language_loss": 0.81243753, + "learning_rate": 3.99519608253213e-06, + "loss": 0.84322453, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.8117637634277344 + }, + { + "auxiliary_loss_clip": 0.01734653, + "auxiliary_loss_mlp": 0.0142907, + "balance_loss_clip": 1.34729135, + "balance_loss_mlp": 1.11244965, + "epoch": 0.05146550428378175, + "flos": 65624829462720.0, + "grad_norm": 1.008356440234955, + "language_loss": 0.65507519, + "learning_rate": 3.995169067212227e-06, + "loss": 0.6867125, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 4.730028390884399 + }, + { + "auxiliary_loss_clip": 0.01600895, + "auxiliary_loss_mlp": 0.01458077, + "balance_loss_clip": 1.22043538, + "balance_loss_mlp": 1.0813756, + "epoch": 0.05152562753644972, + "flos": 22057076000160.0, + "grad_norm": 2.33886313708137, + "language_loss": 0.773076, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.80366576, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 2.816222906112671 + }, + { + "auxiliary_loss_clip": 0.01598208, + "auxiliary_loss_mlp": 0.01476116, + "balance_loss_clip": 1.2186389, + "balance_loss_mlp": 1.1009407, + "epoch": 0.051585750789117694, + "flos": 18511212839040.0, + "grad_norm": 2.458879620621452, + "language_loss": 0.89546257, + "learning_rate": 3.995114809602412e-06, + "loss": 0.92620575, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 2.801893949508667 + }, + { + "auxiliary_loss_clip": 0.01599216, + "auxiliary_loss_mlp": 0.01472734, + "balance_loss_clip": 1.21852291, + "balance_loss_mlp": 1.10766768, + "epoch": 0.05164587404178566, + "flos": 23732379586080.0, + "grad_norm": 2.088510336351858, + "language_loss": 0.75351393, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.78423345, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 2.9374165534973145 + }, + { + "auxiliary_loss_clip": 0.01597456, + "auxiliary_loss_mlp": 0.0147282, + "balance_loss_clip": 1.21680963, + "balance_loss_mlp": 1.09897995, + "epoch": 0.05170599729445363, + "flos": 16254730228320.0, + "grad_norm": 2.2452771784761207, + "language_loss": 0.9092375, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93994027, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.854414701461792 + }, + { + "auxiliary_loss_clip": 0.01601267, + "auxiliary_loss_mlp": 0.014816, + "balance_loss_clip": 1.2197516, + "balance_loss_mlp": 1.10566187, + "epoch": 0.0517661205471216, + "flos": 23988093791040.0, + "grad_norm": 2.209103209016842, + "language_loss": 0.82394964, + "learning_rate": 3.99503285577813e-06, + "loss": 0.85477829, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 2.879263401031494 + }, + { + "auxiliary_loss_clip": 0.0159632, + "auxiliary_loss_mlp": 0.01467884, + "balance_loss_clip": 1.21668696, + "balance_loss_mlp": 1.09900284, + "epoch": 0.05182624379978957, + "flos": 29280262782240.0, + "grad_norm": 2.0410294602725365, + "language_loss": 0.7850132, + "learning_rate": 3.995005386531627e-06, + "loss": 0.81565523, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.9554338455200195 + }, + { + "auxiliary_loss_clip": 0.0160703, + "auxiliary_loss_mlp": 0.01491718, + "balance_loss_clip": 1.22620368, + "balance_loss_mlp": 1.1186409, + "epoch": 0.05188636705245754, + "flos": 24173147105280.0, + "grad_norm": 2.147359983320735, + "language_loss": 0.88924599, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.92023343, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.8967883586883545 + }, + { + "auxiliary_loss_clip": 0.0159638, + "auxiliary_loss_mlp": 0.01473329, + "balance_loss_clip": 1.21806431, + "balance_loss_mlp": 1.09910679, + "epoch": 0.051946490305125506, + "flos": 26763135305760.0, + "grad_norm": 2.346198645359186, + "language_loss": 0.76007092, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.79076803, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.8761043548583984 + }, + { + "auxiliary_loss_clip": 0.01595058, + "auxiliary_loss_mlp": 0.01467472, + "balance_loss_clip": 1.21515989, + "balance_loss_mlp": 1.09706497, + "epoch": 0.05200661355779348, + "flos": 21504419316000.0, + "grad_norm": 2.375808773530364, + "language_loss": 0.78891671, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81954205, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.8692169189453125 + }, + { + "auxiliary_loss_clip": 0.01597415, + "auxiliary_loss_mlp": 0.01487879, + "balance_loss_clip": 1.21612859, + "balance_loss_mlp": 1.12014222, + "epoch": 0.05206673681046144, + "flos": 18116490470400.0, + "grad_norm": 2.5483234045715255, + "language_loss": 0.8640132, + "learning_rate": 3.994894753048032e-06, + "loss": 0.89486611, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 2.790088176727295 + }, + { + "auxiliary_loss_clip": 0.01600244, + "auxiliary_loss_mlp": 0.01472514, + "balance_loss_clip": 1.21953321, + "balance_loss_mlp": 1.09657598, + "epoch": 0.052126860063129415, + "flos": 17525791477440.0, + "grad_norm": 3.539585314644739, + "language_loss": 0.87512046, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.90584803, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.840233325958252 + }, + { + "auxiliary_loss_clip": 0.01600233, + "auxiliary_loss_mlp": 0.01467137, + "balance_loss_clip": 1.21984398, + "balance_loss_mlp": 1.10245204, + "epoch": 0.05218698331579739, + "flos": 32600792558880.0, + "grad_norm": 1.5606560984680955, + "language_loss": 0.63945436, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.67012811, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.8705837726593018 + }, + { + "auxiliary_loss_clip": 0.01589835, + "auxiliary_loss_mlp": 0.01470492, + "balance_loss_clip": 1.20906782, + "balance_loss_mlp": 1.09836817, + "epoch": 0.05224710656846535, + "flos": 22129481586240.0, + "grad_norm": 2.5949860297294087, + "language_loss": 0.83714342, + "learning_rate": 3.994810983642281e-06, + "loss": 0.86774665, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.8953895568847656 + }, + { + "auxiliary_loss_clip": 0.01597705, + "auxiliary_loss_mlp": 0.01479839, + "balance_loss_clip": 1.21726322, + "balance_loss_mlp": 1.1119113, + "epoch": 0.052307229821133325, + "flos": 11146666347360.0, + "grad_norm": 2.1096419535287123, + "language_loss": 0.88128465, + "learning_rate": 3.994782909218751e-06, + "loss": 0.91206014, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.7870328426361084 + }, + { + "auxiliary_loss_clip": 0.01589632, + "auxiliary_loss_mlp": 0.01479421, + "balance_loss_clip": 1.21034527, + "balance_loss_mlp": 1.11588001, + "epoch": 0.05236735307380129, + "flos": 19129675609440.0, + "grad_norm": 2.360478224240403, + "language_loss": 0.80962813, + "learning_rate": 3.994754759152854e-06, + "loss": 0.84031868, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.847245216369629 + }, + { + "auxiliary_loss_clip": 0.01600504, + "auxiliary_loss_mlp": 0.01471082, + "balance_loss_clip": 1.22130167, + "balance_loss_mlp": 1.09876812, + "epoch": 0.05242747632646926, + "flos": 20962837654560.0, + "grad_norm": 11.046162117937106, + "language_loss": 0.81454682, + "learning_rate": 3.994726533445656e-06, + "loss": 0.84526277, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.813397169113159 + }, + { + "auxiliary_loss_clip": 0.01709386, + "auxiliary_loss_mlp": 0.01414452, + "balance_loss_clip": 1.32006061, + "balance_loss_mlp": 1.09783173, + "epoch": 0.052487599579137234, + "flos": 65026393125120.0, + "grad_norm": 0.8822452563446321, + "language_loss": 0.61601943, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.6472578, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 3.2629528045654297 + }, + { + "auxiliary_loss_clip": 0.01590659, + "auxiliary_loss_mlp": 0.01480425, + "balance_loss_clip": 1.21066606, + "balance_loss_mlp": 1.10505915, + "epoch": 0.0525477228318052, + "flos": 23290777647360.0, + "grad_norm": 1.8540353342862546, + "language_loss": 0.88891441, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91962528, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.8843746185302734 + }, + { + "auxiliary_loss_clip": 0.01583126, + "auxiliary_loss_mlp": 0.01474809, + "balance_loss_clip": 1.20433736, + "balance_loss_mlp": 1.10688174, + "epoch": 0.05260784608447317, + "flos": 32232771979200.0, + "grad_norm": 3.9634139549465304, + "language_loss": 0.74991399, + "learning_rate": 3.994641402486977e-06, + "loss": 0.78049338, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.888678550720215 + }, + { + "auxiliary_loss_clip": 0.01591597, + "auxiliary_loss_mlp": 0.0147308, + "balance_loss_clip": 1.21120119, + "balance_loss_mlp": 1.10725009, + "epoch": 0.052667969337141136, + "flos": 24465576133440.0, + "grad_norm": 1.953156305308115, + "language_loss": 0.92805696, + "learning_rate": 3.99461287422531e-06, + "loss": 0.95870376, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 2.8290693759918213 + }, + { + "auxiliary_loss_clip": 0.01698165, + "auxiliary_loss_mlp": 0.01424049, + "balance_loss_clip": 1.30936253, + "balance_loss_mlp": 1.10971832, + "epoch": 0.05272809258980911, + "flos": 57791144823840.0, + "grad_norm": 0.8247151591673576, + "language_loss": 0.62887907, + "learning_rate": 3.994584270327722e-06, + "loss": 0.66010118, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 3.2497611045837402 + }, + { + "auxiliary_loss_clip": 0.01587415, + "auxiliary_loss_mlp": 0.01472437, + "balance_loss_clip": 1.2086339, + "balance_loss_mlp": 1.10851455, + "epoch": 0.05278821584247708, + "flos": 17422739573760.0, + "grad_norm": 2.363765736462934, + "language_loss": 0.85590541, + "learning_rate": 3.994555590795299e-06, + "loss": 0.88650393, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.8238160610198975 + }, + { + "auxiliary_loss_clip": 0.01596486, + "auxiliary_loss_mlp": 0.01483965, + "balance_loss_clip": 1.21900439, + "balance_loss_mlp": 1.1118412, + "epoch": 0.052848339095145046, + "flos": 26139476377440.0, + "grad_norm": 2.089510195160713, + "language_loss": 0.83232868, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.86313319, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 2.908902645111084 + }, + { + "auxiliary_loss_clip": 0.01586076, + "auxiliary_loss_mlp": 0.01477456, + "balance_loss_clip": 1.20697665, + "balance_loss_mlp": 1.09961069, + "epoch": 0.05290846234781302, + "flos": 16473957179040.0, + "grad_norm": 2.309989508582945, + "language_loss": 0.84753323, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.87816858, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 2.8046133518218994 + }, + { + "auxiliary_loss_clip": 0.01590001, + "auxiliary_loss_mlp": 0.01482661, + "balance_loss_clip": 1.21126866, + "balance_loss_mlp": 1.11397076, + "epoch": 0.05296858560048098, + "flos": 19867233895200.0, + "grad_norm": 2.5622918283617873, + "language_loss": 0.87313247, + "learning_rate": 3.994469098399906e-06, + "loss": 0.90385914, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.8081283569335938 + }, + { + "auxiliary_loss_clip": 0.01588328, + "auxiliary_loss_mlp": 0.01484208, + "balance_loss_clip": 1.20841956, + "balance_loss_mlp": 1.12181211, + "epoch": 0.053028708853148955, + "flos": 24390856929600.0, + "grad_norm": 1.9555237456255432, + "language_loss": 0.88005513, + "learning_rate": 3.994440116339046e-06, + "loss": 0.91078049, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 2.8642282485961914 + }, + { + "auxiliary_loss_clip": 0.01587442, + "auxiliary_loss_mlp": 0.01477344, + "balance_loss_clip": 1.20857561, + "balance_loss_mlp": 1.10197759, + "epoch": 0.05308883210581693, + "flos": 36396149706720.0, + "grad_norm": 2.4017676646890864, + "language_loss": 0.69566888, + "learning_rate": 3.994411058648816e-06, + "loss": 0.72631681, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.853809118270874 + }, + { + "auxiliary_loss_clip": 0.01586852, + "auxiliary_loss_mlp": 0.01463264, + "balance_loss_clip": 1.20856822, + "balance_loss_mlp": 1.09342921, + "epoch": 0.05314895535848489, + "flos": 22857216478560.0, + "grad_norm": 4.322714227738021, + "language_loss": 0.76404464, + "learning_rate": 3.994381925330319e-06, + "loss": 0.79454583, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.738755941390991 + }, + { + "auxiliary_loss_clip": 0.01586641, + "auxiliary_loss_mlp": 0.01475156, + "balance_loss_clip": 1.20681429, + "balance_loss_mlp": 1.10322261, + "epoch": 0.053209078611152864, + "flos": 12862022434560.0, + "grad_norm": 3.802763482752133, + "language_loss": 0.85850167, + "learning_rate": 3.994352716384659e-06, + "loss": 0.88911963, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.8229928016662598 + }, + { + "auxiliary_loss_clip": 0.01579903, + "auxiliary_loss_mlp": 0.01469025, + "balance_loss_clip": 1.20162296, + "balance_loss_mlp": 1.09384918, + "epoch": 0.05326920186382083, + "flos": 12166109632800.0, + "grad_norm": 3.331399034279253, + "language_loss": 0.86260587, + "learning_rate": 3.994323431812945e-06, + "loss": 0.89309514, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 2.804697275161743 + }, + { + "auxiliary_loss_clip": 0.01587757, + "auxiliary_loss_mlp": 0.01475561, + "balance_loss_clip": 1.20575356, + "balance_loss_mlp": 1.0992415, + "epoch": 0.0533293251164888, + "flos": 22706336800800.0, + "grad_norm": 2.3931877912736432, + "language_loss": 0.89618456, + "learning_rate": 3.994294071616286e-06, + "loss": 0.92681777, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.8402979373931885 + }, + { + "auxiliary_loss_clip": 0.01582138, + "auxiliary_loss_mlp": 0.01484929, + "balance_loss_clip": 1.20195436, + "balance_loss_mlp": 1.11642933, + "epoch": 0.053389448369156774, + "flos": 26943257959200.0, + "grad_norm": 5.031690705282135, + "language_loss": 0.75106186, + "learning_rate": 3.994264635795796e-06, + "loss": 0.7817325, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 2.8035850524902344 + }, + { + "auxiliary_loss_clip": 0.01582512, + "auxiliary_loss_mlp": 0.01477391, + "balance_loss_clip": 1.20265281, + "balance_loss_mlp": 1.10641193, + "epoch": 0.05344957162182474, + "flos": 25558676634240.0, + "grad_norm": 2.2564842569999533, + "language_loss": 0.89139247, + "learning_rate": 3.994235124352592e-06, + "loss": 0.92199147, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.7702956199645996 + }, + { + "auxiliary_loss_clip": 0.01579528, + "auxiliary_loss_mlp": 0.01476771, + "balance_loss_clip": 1.19977832, + "balance_loss_mlp": 1.09320343, + "epoch": 0.05350969487449271, + "flos": 19721664159840.0, + "grad_norm": 2.39901729820102, + "language_loss": 0.89153272, + "learning_rate": 3.994205537287791e-06, + "loss": 0.92209566, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 2.827716112136841 + }, + { + "auxiliary_loss_clip": 0.01583391, + "auxiliary_loss_mlp": 0.01468177, + "balance_loss_clip": 1.20317507, + "balance_loss_mlp": 1.08575392, + "epoch": 0.053569818127160676, + "flos": 27018811582560.0, + "grad_norm": 2.5101511906940486, + "language_loss": 0.93545747, + "learning_rate": 3.994175874602517e-06, + "loss": 0.96597314, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 6.059449672698975 + }, + { + "auxiliary_loss_clip": 0.01583249, + "auxiliary_loss_mlp": 0.01471063, + "balance_loss_clip": 1.20266557, + "balance_loss_mlp": 1.1019913, + "epoch": 0.05362994137982865, + "flos": 13190028441120.0, + "grad_norm": 2.3761788967059423, + "language_loss": 0.72063005, + "learning_rate": 3.994146136297893e-06, + "loss": 0.7511732, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.8466639518737793 + }, + { + "auxiliary_loss_clip": 0.01573235, + "auxiliary_loss_mlp": 0.01484159, + "balance_loss_clip": 1.19284296, + "balance_loss_mlp": 1.11260748, + "epoch": 0.05369006463249662, + "flos": 28660548382560.0, + "grad_norm": 1.81911636429595, + "language_loss": 0.82440877, + "learning_rate": 3.994116322375049e-06, + "loss": 0.85498273, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 4.322775840759277 + }, + { + "auxiliary_loss_clip": 0.01573372, + "auxiliary_loss_mlp": 0.01464715, + "balance_loss_clip": 1.19133711, + "balance_loss_mlp": 1.08667827, + "epoch": 0.053750187885164585, + "flos": 28915693665120.0, + "grad_norm": 2.1923773285741053, + "language_loss": 0.81923354, + "learning_rate": 3.994086432835114e-06, + "loss": 0.84961438, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 2.874113082885742 + }, + { + "auxiliary_loss_clip": 0.01582284, + "auxiliary_loss_mlp": 0.01466894, + "balance_loss_clip": 1.20092678, + "balance_loss_mlp": 1.09991992, + "epoch": 0.05381031113783256, + "flos": 15160643595360.0, + "grad_norm": 3.5627904540213757, + "language_loss": 0.76050252, + "learning_rate": 3.994056467679221e-06, + "loss": 0.79099429, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 2.852187395095825 + }, + { + "auxiliary_loss_clip": 0.01584369, + "auxiliary_loss_mlp": 0.01467129, + "balance_loss_clip": 1.20476413, + "balance_loss_mlp": 1.09462404, + "epoch": 0.05387043439050053, + "flos": 21837393911520.0, + "grad_norm": 2.6152717308567834, + "language_loss": 0.86652863, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.89704359, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 2.760192394256592 + }, + { + "auxiliary_loss_clip": 0.01578432, + "auxiliary_loss_mlp": 0.01462506, + "balance_loss_clip": 1.19821918, + "balance_loss_mlp": 1.08942866, + "epoch": 0.053930557643168495, + "flos": 17312177894400.0, + "grad_norm": 2.4279087804038015, + "language_loss": 0.88240099, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.91281039, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 2.77807879447937 + }, + { + "auxiliary_loss_clip": 0.01585845, + "auxiliary_loss_mlp": 0.01484614, + "balance_loss_clip": 1.20362687, + "balance_loss_mlp": 1.11401618, + "epoch": 0.05399068089583647, + "flos": 17350447772160.0, + "grad_norm": 1.7297112102282706, + "language_loss": 0.90282303, + "learning_rate": 3.993966118527175e-06, + "loss": 0.93352771, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 2.7447311878204346 + }, + { + "auxiliary_loss_clip": 0.01590652, + "auxiliary_loss_mlp": 0.01472384, + "balance_loss_clip": 1.20792127, + "balance_loss_mlp": 1.10369349, + "epoch": 0.05405080414850443, + "flos": 17488507731840.0, + "grad_norm": 5.797967638567358, + "language_loss": 0.91622341, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94685376, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.757105588912964 + }, + { + "auxiliary_loss_clip": 0.0157963, + "auxiliary_loss_mlp": 0.01473172, + "balance_loss_clip": 1.19894433, + "balance_loss_mlp": 1.09589863, + "epoch": 0.054110927401172404, + "flos": 24498991206720.0, + "grad_norm": 2.8803569807369045, + "language_loss": 0.75566995, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.78619802, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 2.766263008117676 + }, + { + "auxiliary_loss_clip": 0.01579005, + "auxiliary_loss_mlp": 0.01484463, + "balance_loss_clip": 1.19774699, + "balance_loss_mlp": 1.11520052, + "epoch": 0.054171050653840376, + "flos": 22932277035840.0, + "grad_norm": 3.0484489991036203, + "language_loss": 0.73780435, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76843894, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.7852628231048584 + }, + { + "auxiliary_loss_clip": 0.01582545, + "auxiliary_loss_mlp": 0.01461942, + "balance_loss_clip": 1.20086396, + "balance_loss_mlp": 1.09096265, + "epoch": 0.05423117390650834, + "flos": 12934883158560.0, + "grad_norm": 2.6348912657846104, + "language_loss": 0.84882373, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87926865, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.7914254665374756 + }, + { + "auxiliary_loss_clip": 0.01575066, + "auxiliary_loss_mlp": 0.0148735, + "balance_loss_clip": 1.19246244, + "balance_loss_mlp": 1.12438202, + "epoch": 0.05429129715917631, + "flos": 19903379796000.0, + "grad_norm": 2.24416107945362, + "language_loss": 0.86531448, + "learning_rate": 3.993814024394569e-06, + "loss": 0.89593863, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.9342920780181885 + }, + { + "auxiliary_loss_clip": 0.01573992, + "auxiliary_loss_mlp": 0.01472826, + "balance_loss_clip": 1.19109917, + "balance_loss_mlp": 1.1014657, + "epoch": 0.05435142041184428, + "flos": 16910438816160.0, + "grad_norm": 2.3966320521637856, + "language_loss": 0.75151861, + "learning_rate": 3.993783378746537e-06, + "loss": 0.78198671, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 3.061764717102051 + }, + { + "auxiliary_loss_clip": 0.01586388, + "auxiliary_loss_mlp": 0.0146964, + "balance_loss_clip": 1.20473826, + "balance_loss_mlp": 1.09675336, + "epoch": 0.05441154366451225, + "flos": 23950430763840.0, + "grad_norm": 3.218780343340011, + "language_loss": 0.85740304, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88796329, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 2.788529872894287 + }, + { + "auxiliary_loss_clip": 0.0158655, + "auxiliary_loss_mlp": 0.01486375, + "balance_loss_clip": 1.20486093, + "balance_loss_mlp": 1.11692202, + "epoch": 0.05447166691718022, + "flos": 19977492149280.0, + "grad_norm": 1.9640616828481328, + "language_loss": 0.74351162, + "learning_rate": 3.993721860638241e-06, + "loss": 0.77424085, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.84883451461792 + }, + { + "auxiliary_loss_clip": 0.01579339, + "auxiliary_loss_mlp": 0.01477017, + "balance_loss_clip": 1.19674981, + "balance_loss_mlp": 1.11404848, + "epoch": 0.05453179016984819, + "flos": 24938848450080.0, + "grad_norm": 2.704651352695237, + "language_loss": 0.87963486, + "learning_rate": 3.993690988180309e-06, + "loss": 0.91019845, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.895956516265869 + }, + { + "auxiliary_loss_clip": 0.01580442, + "auxiliary_loss_mlp": 0.01497846, + "balance_loss_clip": 1.19631219, + "balance_loss_mlp": 1.14498651, + "epoch": 0.05459191342251616, + "flos": 18117135249120.0, + "grad_norm": 1.9921522451497469, + "language_loss": 0.86938208, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.90016496, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.9089877605438232 + }, + { + "auxiliary_loss_clip": 0.01590268, + "auxiliary_loss_mlp": 0.0148141, + "balance_loss_clip": 1.20890737, + "balance_loss_mlp": 1.11043048, + "epoch": 0.054652036675184125, + "flos": 19210311606240.0, + "grad_norm": 3.2081947771519337, + "language_loss": 0.89944357, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.93016028, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.8231399059295654 + }, + { + "auxiliary_loss_clip": 0.01579596, + "auxiliary_loss_mlp": 0.01469494, + "balance_loss_clip": 1.19516301, + "balance_loss_mlp": 1.10023153, + "epoch": 0.0547121599278521, + "flos": 16327590952320.0, + "grad_norm": 18.502311688786794, + "language_loss": 0.71255028, + "learning_rate": 3.99359791720544e-06, + "loss": 0.74304116, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.791632890701294 + }, + { + "auxiliary_loss_clip": 0.01590222, + "auxiliary_loss_mlp": 0.01469482, + "balance_loss_clip": 1.20782351, + "balance_loss_mlp": 1.09907532, + "epoch": 0.05477228318052007, + "flos": 20341075134240.0, + "grad_norm": 1.7687325063422132, + "language_loss": 0.83215261, + "learning_rate": 3.993566742350714e-06, + "loss": 0.86274964, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.829622268676758 + }, + { + "auxiliary_loss_clip": 0.01579937, + "auxiliary_loss_mlp": 0.01460371, + "balance_loss_clip": 1.19807494, + "balance_loss_mlp": 1.08347881, + "epoch": 0.054832406433188034, + "flos": 21974809092480.0, + "grad_norm": 3.5461043088608846, + "language_loss": 0.76024699, + "learning_rate": 3.993535491899736e-06, + "loss": 0.79065013, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 2.7831602096557617 + }, + { + "auxiliary_loss_clip": 0.01586205, + "auxiliary_loss_mlp": 0.01482148, + "balance_loss_clip": 1.20255554, + "balance_loss_mlp": 1.11841679, + "epoch": 0.054892529685856006, + "flos": 16400793029760.0, + "grad_norm": 2.6545349356638246, + "language_loss": 0.82804286, + "learning_rate": 3.993504165853694e-06, + "loss": 0.85872638, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.798112154006958 + }, + { + "auxiliary_loss_clip": 0.01598102, + "auxiliary_loss_mlp": 0.01476206, + "balance_loss_clip": 1.21512079, + "balance_loss_mlp": 1.10751534, + "epoch": 0.05495265293852397, + "flos": 23914474503840.0, + "grad_norm": 1.9583870823157739, + "language_loss": 0.83628416, + "learning_rate": 3.993472764213772e-06, + "loss": 0.86702728, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.817072868347168 + }, + { + "auxiliary_loss_clip": 0.01590366, + "auxiliary_loss_mlp": 0.01463607, + "balance_loss_clip": 1.20666718, + "balance_loss_mlp": 1.09243703, + "epoch": 0.055012776191191944, + "flos": 23589237252960.0, + "grad_norm": 2.703513340463389, + "language_loss": 0.90108049, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.93162024, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.8275184631347656 + }, + { + "auxiliary_loss_clip": 0.01587425, + "auxiliary_loss_mlp": 0.0147108, + "balance_loss_clip": 1.20366621, + "balance_loss_mlp": 1.10601354, + "epoch": 0.055072899443859916, + "flos": 17530342856640.0, + "grad_norm": 1.8504131293267365, + "language_loss": 0.89644158, + "learning_rate": 3.993409734157064e-06, + "loss": 0.92702663, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 2.7388112545013428 + }, + { + "auxiliary_loss_clip": 0.01593747, + "auxiliary_loss_mlp": 0.01468129, + "balance_loss_clip": 1.20998836, + "balance_loss_mlp": 1.0931443, + "epoch": 0.05513302269652788, + "flos": 21689207133120.0, + "grad_norm": 2.023921049372844, + "language_loss": 0.79845005, + "learning_rate": 3.993378105742666e-06, + "loss": 0.8290689, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 2.8230316638946533 + }, + { + "auxiliary_loss_clip": 0.01586846, + "auxiliary_loss_mlp": 0.01471143, + "balance_loss_clip": 1.20289648, + "balance_loss_mlp": 1.09634864, + "epoch": 0.05519314594919585, + "flos": 21615322348800.0, + "grad_norm": 3.0533539971198413, + "language_loss": 0.79880834, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.82938826, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.8050055503845215 + }, + { + "auxiliary_loss_clip": 0.01584741, + "auxiliary_loss_mlp": 0.01494301, + "balance_loss_clip": 1.20218682, + "balance_loss_mlp": 1.12732685, + "epoch": 0.05525326920186382, + "flos": 21800717016480.0, + "grad_norm": 2.477783061196228, + "language_loss": 0.88994378, + "learning_rate": 3.99331462214778e-06, + "loss": 0.92073423, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 2.7779901027679443 + }, + { + "auxiliary_loss_clip": 0.01588713, + "auxiliary_loss_mlp": 0.01487744, + "balance_loss_clip": 1.20542431, + "balance_loss_mlp": 1.1221056, + "epoch": 0.05531339245453179, + "flos": 28442193779520.0, + "grad_norm": 2.4499651486649228, + "language_loss": 0.87500793, + "learning_rate": 3.993282766969699e-06, + "loss": 0.90577251, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 2.8487653732299805 + }, + { + "auxiliary_loss_clip": 0.01589892, + "auxiliary_loss_mlp": 0.01470949, + "balance_loss_clip": 1.20707273, + "balance_loss_mlp": 1.10950637, + "epoch": 0.05537351570719976, + "flos": 37378005821280.0, + "grad_norm": 2.8610711973441343, + "language_loss": 0.66429126, + "learning_rate": 3.993250836206136e-06, + "loss": 0.69489968, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.8717963695526123 + }, + { + "auxiliary_loss_clip": 0.01589492, + "auxiliary_loss_mlp": 0.01454885, + "balance_loss_clip": 1.20746422, + "balance_loss_mlp": 1.0860039, + "epoch": 0.05543363895986773, + "flos": 20086574630400.0, + "grad_norm": 2.432720792633366, + "language_loss": 0.7227729, + "learning_rate": 3.993218829858301e-06, + "loss": 0.75321668, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.809373378753662 + }, + { + "auxiliary_loss_clip": 0.01581599, + "auxiliary_loss_mlp": 0.01466312, + "balance_loss_clip": 1.19633234, + "balance_loss_mlp": 1.09724021, + "epoch": 0.0554937622125357, + "flos": 24535402604640.0, + "grad_norm": 7.565131860801407, + "language_loss": 0.8236258, + "learning_rate": 3.993186747927408e-06, + "loss": 0.85410488, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.7801365852355957 + }, + { + "auxiliary_loss_clip": 0.01586452, + "auxiliary_loss_mlp": 0.01479523, + "balance_loss_clip": 1.2028985, + "balance_loss_mlp": 1.11178565, + "epoch": 0.055553885465203665, + "flos": 14321853957600.0, + "grad_norm": 3.020717002808865, + "language_loss": 0.78804839, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81870818, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.777106523513794 + }, + { + "auxiliary_loss_clip": 0.01588241, + "auxiliary_loss_mlp": 0.01486062, + "balance_loss_clip": 1.20288277, + "balance_loss_mlp": 1.12404728, + "epoch": 0.05561400871787164, + "flos": 27383608268640.0, + "grad_norm": 2.158414109782977, + "language_loss": 1.02199936, + "learning_rate": 3.993122357321319e-06, + "loss": 1.05274248, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 2.8205056190490723 + }, + { + "auxiliary_loss_clip": 0.01581305, + "auxiliary_loss_mlp": 0.0146813, + "balance_loss_clip": 1.19758558, + "balance_loss_mlp": 1.1091665, + "epoch": 0.05567413197053961, + "flos": 23223302722080.0, + "grad_norm": 2.2562700424087243, + "language_loss": 0.81306714, + "learning_rate": 3.993090048648564e-06, + "loss": 0.84356141, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 2.767641067504883 + }, + { + "auxiliary_loss_clip": 0.01592342, + "auxiliary_loss_mlp": 0.0149242, + "balance_loss_clip": 1.20689821, + "balance_loss_mlp": 1.12926054, + "epoch": 0.055734255223207574, + "flos": 25267309594560.0, + "grad_norm": 3.016115405390078, + "language_loss": 0.73837197, + "learning_rate": 3.993057664397634e-06, + "loss": 0.76921964, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.8269176483154297 + }, + { + "auxiliary_loss_clip": 0.01688528, + "auxiliary_loss_mlp": 0.0141851, + "balance_loss_clip": 1.29736662, + "balance_loss_mlp": 1.11180878, + "epoch": 0.055794378475875546, + "flos": 66510346957920.0, + "grad_norm": 0.8749106061187115, + "language_loss": 0.59800327, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62907368, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.3592562675476074 + }, + { + "auxiliary_loss_clip": 0.01584321, + "auxiliary_loss_mlp": 0.01489469, + "balance_loss_clip": 1.2001493, + "balance_loss_mlp": 1.11677313, + "epoch": 0.05585450172854351, + "flos": 25339980677760.0, + "grad_norm": 2.3589392201618202, + "language_loss": 0.95634663, + "learning_rate": 3.992992669166168e-06, + "loss": 0.98708457, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 5.80059027671814 + }, + { + "auxiliary_loss_clip": 0.01591931, + "auxiliary_loss_mlp": 0.01473668, + "balance_loss_clip": 1.2061311, + "balance_loss_mlp": 1.10287917, + "epoch": 0.05591462498121148, + "flos": 33914561280480.0, + "grad_norm": 2.4775970363572033, + "language_loss": 0.72122526, + "learning_rate": 3.992960058188094e-06, + "loss": 0.75188118, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.8912577629089355 + }, + { + "auxiliary_loss_clip": 0.01583704, + "auxiliary_loss_mlp": 0.01461813, + "balance_loss_clip": 1.19928086, + "balance_loss_mlp": 1.09693694, + "epoch": 0.055974748233879455, + "flos": 17932802569920.0, + "grad_norm": 4.592310123272022, + "language_loss": 0.85607702, + "learning_rate": 3.992927371636776e-06, + "loss": 0.88653219, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 4.477998971939087 + }, + { + "auxiliary_loss_clip": 0.01587145, + "auxiliary_loss_mlp": 0.01488444, + "balance_loss_clip": 1.20271575, + "balance_loss_mlp": 1.11784601, + "epoch": 0.05603487148654742, + "flos": 24024125907360.0, + "grad_norm": 2.0110193733654036, + "language_loss": 0.83743799, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.86819392, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 2.787367343902588 + }, + { + "auxiliary_loss_clip": 0.01590074, + "auxiliary_loss_mlp": 0.01470171, + "balance_loss_clip": 1.20617819, + "balance_loss_mlp": 1.09385157, + "epoch": 0.05609499473921539, + "flos": 17309484995040.0, + "grad_norm": 2.1611229301032706, + "language_loss": 0.73705757, + "learning_rate": 3.992861771819365e-06, + "loss": 0.76766008, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 2.7830352783203125 + }, + { + "auxiliary_loss_clip": 0.01587172, + "auxiliary_loss_mlp": 0.01476488, + "balance_loss_clip": 1.20391822, + "balance_loss_mlp": 1.10398316, + "epoch": 0.05615511799188336, + "flos": 20996783722080.0, + "grad_norm": 10.386631902798888, + "language_loss": 0.86498177, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89561832, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.8443844318389893 + }, + { + "auxiliary_loss_clip": 0.01588254, + "auxiliary_loss_mlp": 0.01465815, + "balance_loss_clip": 1.20504653, + "balance_loss_mlp": 1.09254646, + "epoch": 0.05621524124455133, + "flos": 17275387214880.0, + "grad_norm": 2.239040818841728, + "language_loss": 0.80421621, + "learning_rate": 3.992795869723885e-06, + "loss": 0.83475685, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 2.768739700317383 + }, + { + "auxiliary_loss_clip": 0.01698256, + "auxiliary_loss_mlp": 0.01393127, + "balance_loss_clip": 1.31192601, + "balance_loss_mlp": 1.08108521, + "epoch": 0.0562753644972193, + "flos": 58725476589600.0, + "grad_norm": 0.8194967617090049, + "language_loss": 0.69085771, + "learning_rate": 3.99276280532499e-06, + "loss": 0.72177154, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 3.2990236282348633 + }, + { + "auxiliary_loss_clip": 0.01581575, + "auxiliary_loss_mlp": 0.01456997, + "balance_loss_clip": 1.19798982, + "balance_loss_mlp": 1.08868754, + "epoch": 0.05633548774988727, + "flos": 17458961330880.0, + "grad_norm": 3.469602442069956, + "language_loss": 0.76066935, + "learning_rate": 3.992729665360331e-06, + "loss": 0.79105502, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 2.812197208404541 + }, + { + "auxiliary_loss_clip": 0.01698276, + "auxiliary_loss_mlp": 0.01391762, + "balance_loss_clip": 1.31205893, + "balance_loss_mlp": 1.08353424, + "epoch": 0.05639561100255524, + "flos": 70661966955840.0, + "grad_norm": 0.8611666437594059, + "language_loss": 0.64223945, + "learning_rate": 3.992696449831162e-06, + "loss": 0.67313987, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 3.2762291431427 + }, + { + "auxiliary_loss_clip": 0.01579582, + "auxiliary_loss_mlp": 0.01468171, + "balance_loss_clip": 1.19592237, + "balance_loss_mlp": 1.09414029, + "epoch": 0.056455734255223204, + "flos": 20488124067840.0, + "grad_norm": 2.9085811700734583, + "language_loss": 0.80070096, + "learning_rate": 3.992663158738745e-06, + "loss": 0.83117843, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 2.8234293460845947 + }, + { + "auxiliary_loss_clip": 0.01585786, + "auxiliary_loss_mlp": 0.01457925, + "balance_loss_clip": 1.20034623, + "balance_loss_mlp": 1.09190512, + "epoch": 0.056515857507891176, + "flos": 22055710586400.0, + "grad_norm": 1.709023533130207, + "language_loss": 0.74022007, + "learning_rate": 3.992629792084341e-06, + "loss": 0.77065718, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 2.7640204429626465 + }, + { + "auxiliary_loss_clip": 0.01586617, + "auxiliary_loss_mlp": 0.0147237, + "balance_loss_clip": 1.20287585, + "balance_loss_mlp": 1.09738541, + "epoch": 0.05657598076055915, + "flos": 24027653226240.0, + "grad_norm": 2.4234592712837584, + "language_loss": 0.70766848, + "learning_rate": 3.992596349869216e-06, + "loss": 0.73825836, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.82121205329895 + }, + { + "auxiliary_loss_clip": 0.01582777, + "auxiliary_loss_mlp": 0.01468865, + "balance_loss_clip": 1.1992377, + "balance_loss_mlp": 1.09864855, + "epoch": 0.05663610401322711, + "flos": 20482434843840.0, + "grad_norm": 2.1956467509507935, + "language_loss": 0.81086397, + "learning_rate": 3.992562832094637e-06, + "loss": 0.84138042, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.756229877471924 + }, + { + "auxiliary_loss_clip": 0.01577364, + "auxiliary_loss_mlp": 0.01481111, + "balance_loss_clip": 1.19336879, + "balance_loss_mlp": 1.12157559, + "epoch": 0.056696227265895086, + "flos": 21071161572480.0, + "grad_norm": 2.4060743203335786, + "language_loss": 0.88188273, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.91246748, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.7706823348999023 + }, + { + "auxiliary_loss_clip": 0.01583609, + "auxiliary_loss_mlp": 0.01477628, + "balance_loss_clip": 1.19968808, + "balance_loss_mlp": 1.11885571, + "epoch": 0.05675635051856306, + "flos": 17823227022720.0, + "grad_norm": 2.4497849279820936, + "language_loss": 0.74846667, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77907908, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 2.737990617752075 + }, + { + "auxiliary_loss_clip": 0.01580062, + "auxiliary_loss_mlp": 0.01459662, + "balance_loss_clip": 1.19709134, + "balance_loss_mlp": 1.08887362, + "epoch": 0.05681647377123102, + "flos": 23117330350080.0, + "grad_norm": 1.7834004145771278, + "language_loss": 0.79329485, + "learning_rate": 3.992461825426906e-06, + "loss": 0.82369202, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.8189797401428223 + }, + { + "auxiliary_loss_clip": 0.01575189, + "auxiliary_loss_mlp": 0.01462345, + "balance_loss_clip": 1.19220757, + "balance_loss_mlp": 1.09060252, + "epoch": 0.056876597023898995, + "flos": 16072369813440.0, + "grad_norm": 3.986923225246933, + "language_loss": 0.82562339, + "learning_rate": 3.992428005427252e-06, + "loss": 0.85599869, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.7168540954589844 + }, + { + "auxiliary_loss_clip": 0.01581604, + "auxiliary_loss_mlp": 0.01468338, + "balance_loss_clip": 1.19846725, + "balance_loss_mlp": 1.10899353, + "epoch": 0.05693672027656696, + "flos": 16837464307680.0, + "grad_norm": 4.7284191346272495, + "language_loss": 0.79002994, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8205294, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.789585828781128 + }, + { + "auxiliary_loss_clip": 0.01585303, + "auxiliary_loss_mlp": 0.0147102, + "balance_loss_clip": 1.20155537, + "balance_loss_mlp": 1.10633445, + "epoch": 0.05699684352923493, + "flos": 21390785455680.0, + "grad_norm": 209.26423641366537, + "language_loss": 0.86301112, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.89357436, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.774825096130371 + }, + { + "auxiliary_loss_clip": 0.01585212, + "auxiliary_loss_mlp": 0.01482477, + "balance_loss_clip": 1.2011565, + "balance_loss_mlp": 1.1235137, + "epoch": 0.057056966781902904, + "flos": 15562155104640.0, + "grad_norm": 2.363364249325727, + "language_loss": 0.87886238, + "learning_rate": 3.992326092115019e-06, + "loss": 0.90953934, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.8002846240997314 + }, + { + "auxiliary_loss_clip": 0.01584485, + "auxiliary_loss_mlp": 0.01469302, + "balance_loss_clip": 1.20193768, + "balance_loss_mlp": 1.10347271, + "epoch": 0.05711709003457087, + "flos": 19939904978400.0, + "grad_norm": 3.023365141567007, + "language_loss": 0.79115868, + "learning_rate": 3.992291969910811e-06, + "loss": 0.82169652, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.845205783843994 + }, + { + "auxiliary_loss_clip": 0.01581135, + "auxiliary_loss_mlp": 0.0146569, + "balance_loss_clip": 1.19609404, + "balance_loss_mlp": 1.09738123, + "epoch": 0.05717721328723884, + "flos": 30335131333440.0, + "grad_norm": 2.3484796881071164, + "language_loss": 0.83005273, + "learning_rate": 3.992257772158691e-06, + "loss": 0.86052096, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.8236043453216553 + }, + { + "auxiliary_loss_clip": 0.01577999, + "auxiliary_loss_mlp": 0.01459815, + "balance_loss_clip": 1.19411731, + "balance_loss_mlp": 1.0911243, + "epoch": 0.05723733653990681, + "flos": 23656332896640.0, + "grad_norm": 3.311629422901623, + "language_loss": 0.8749038, + "learning_rate": 3.992223498859958e-06, + "loss": 0.90528196, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 2.7385356426239014 + }, + { + "auxiliary_loss_clip": 0.01575465, + "auxiliary_loss_mlp": 0.0145845, + "balance_loss_clip": 1.1907599, + "balance_loss_mlp": 1.09719777, + "epoch": 0.05729745979257478, + "flos": 22058024204160.0, + "grad_norm": 2.975568004202963, + "language_loss": 0.79511917, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.82545829, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.868319034576416 + }, + { + "auxiliary_loss_clip": 0.01590255, + "auxiliary_loss_mlp": 0.01473928, + "balance_loss_clip": 1.20706141, + "balance_loss_mlp": 1.11000597, + "epoch": 0.05735758304524275, + "flos": 19606095963360.0, + "grad_norm": 2.6279910363178685, + "language_loss": 0.87278932, + "learning_rate": 3.992154725627848e-06, + "loss": 0.90343118, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.7753794193267822 + }, + { + "auxiliary_loss_clip": 0.01584845, + "auxiliary_loss_mlp": 0.01472296, + "balance_loss_clip": 1.19984365, + "balance_loss_mlp": 1.1114254, + "epoch": 0.057417706297910716, + "flos": 19101039484320.0, + "grad_norm": 2.6050203586468093, + "language_loss": 0.88134825, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.91191965, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 2.754324436187744 + }, + { + "auxiliary_loss_clip": 0.01577533, + "auxiliary_loss_mlp": 0.01464182, + "balance_loss_clip": 1.19411349, + "balance_loss_mlp": 1.10293055, + "epoch": 0.05747782955057869, + "flos": 16656393450240.0, + "grad_norm": 5.514131733370832, + "language_loss": 0.90110672, + "learning_rate": 3.992085650224914e-06, + "loss": 0.93152392, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 2.710756778717041 + }, + { + "auxiliary_loss_clip": 0.0157982, + "auxiliary_loss_mlp": 0.01456118, + "balance_loss_clip": 1.1962384, + "balance_loss_mlp": 1.0948658, + "epoch": 0.05753795280324665, + "flos": 14503835090880.0, + "grad_norm": 1.990681234269099, + "language_loss": 0.75764883, + "learning_rate": 3.99205099921266e-06, + "loss": 0.78800815, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 2.764216184616089 + }, + { + "auxiliary_loss_clip": 0.01587156, + "auxiliary_loss_mlp": 0.01460512, + "balance_loss_clip": 1.20361686, + "balance_loss_mlp": 1.09678042, + "epoch": 0.057598076055914625, + "flos": 18078220592640.0, + "grad_norm": 3.4231892518720515, + "language_loss": 0.79707068, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82754731, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 2.7222447395324707 + }, + { + "auxiliary_loss_clip": 0.01586549, + "auxiliary_loss_mlp": 0.01456033, + "balance_loss_clip": 1.20307267, + "balance_loss_mlp": 1.08448195, + "epoch": 0.0576581993085826, + "flos": 22126636974240.0, + "grad_norm": 5.895965668478713, + "language_loss": 0.88488895, + "learning_rate": 3.99198147057315e-06, + "loss": 0.91531479, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 2.7726938724517822 + }, + { + "auxiliary_loss_clip": 0.01598119, + "auxiliary_loss_mlp": 0.01470337, + "balance_loss_clip": 1.21482611, + "balance_loss_mlp": 1.11022973, + "epoch": 0.05771832256125056, + "flos": 33184512770400.0, + "grad_norm": 2.5802451510303155, + "language_loss": 0.78817844, + "learning_rate": 3.991946592948529e-06, + "loss": 0.81886297, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 2.8644144535064697 + }, + { + "auxiliary_loss_clip": 0.01588841, + "auxiliary_loss_mlp": 0.01457786, + "balance_loss_clip": 1.20583487, + "balance_loss_mlp": 1.08871388, + "epoch": 0.057778445813918534, + "flos": 24172426470240.0, + "grad_norm": 2.4210996167280268, + "language_loss": 0.92704183, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95750809, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.842827796936035 + }, + { + "auxiliary_loss_clip": 0.01596562, + "auxiliary_loss_mlp": 0.01467587, + "balance_loss_clip": 1.21466827, + "balance_loss_mlp": 1.10728908, + "epoch": 0.0578385690665865, + "flos": 29645628390720.0, + "grad_norm": 3.4883487031923064, + "language_loss": 0.68052918, + "learning_rate": 3.991876611096169e-06, + "loss": 0.71117067, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.8117475509643555 + }, + { + "auxiliary_loss_clip": 0.01598171, + "auxiliary_loss_mlp": 0.01461723, + "balance_loss_clip": 1.21557045, + "balance_loss_mlp": 1.09207916, + "epoch": 0.05789869231925447, + "flos": 20887473672000.0, + "grad_norm": 2.4863306740884794, + "language_loss": 0.88617617, + "learning_rate": 3.991841506871084e-06, + "loss": 0.91677511, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.849250316619873 + }, + { + "auxiliary_loss_clip": 0.01597236, + "auxiliary_loss_mlp": 0.01456095, + "balance_loss_clip": 1.21637082, + "balance_loss_mlp": 1.08225489, + "epoch": 0.057958815571922444, + "flos": 26033504005440.0, + "grad_norm": 2.9217149365754667, + "language_loss": 0.84962946, + "learning_rate": 3.99180632711517e-06, + "loss": 0.88016284, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 2.8980636596679688 + }, + { + "auxiliary_loss_clip": 0.0158718, + "auxiliary_loss_mlp": 0.01469311, + "balance_loss_clip": 1.20698845, + "balance_loss_mlp": 1.09947622, + "epoch": 0.05801893882459041, + "flos": 18079661862720.0, + "grad_norm": 4.594207475980621, + "language_loss": 0.77425355, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80481845, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.829529285430908 + }, + { + "auxiliary_loss_clip": 0.01590461, + "auxiliary_loss_mlp": 0.01461777, + "balance_loss_clip": 1.20958424, + "balance_loss_mlp": 1.09308672, + "epoch": 0.05807906207725838, + "flos": 17750821436640.0, + "grad_norm": 2.421154519005965, + "language_loss": 0.81440961, + "learning_rate": 3.99173574101619e-06, + "loss": 0.84493202, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 4.338393211364746 + }, + { + "auxiliary_loss_clip": 0.01591246, + "auxiliary_loss_mlp": 0.01459432, + "balance_loss_clip": 1.20969486, + "balance_loss_mlp": 1.08997846, + "epoch": 0.058139185329926346, + "flos": 18042302260800.0, + "grad_norm": 3.728385579888426, + "language_loss": 0.76393855, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.79444528, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 4.267562627792358 + }, + { + "auxiliary_loss_clip": 0.01754272, + "auxiliary_loss_mlp": 0.01427048, + "balance_loss_clip": 1.37182689, + "balance_loss_mlp": 1.14247131, + "epoch": 0.05819930858259432, + "flos": 62369347209120.0, + "grad_norm": 0.801513502434146, + "language_loss": 0.57278717, + "learning_rate": 3.991664852809939e-06, + "loss": 0.60460037, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 4.721943616867065 + }, + { + "auxiliary_loss_clip": 0.01587178, + "auxiliary_loss_mlp": 0.01471883, + "balance_loss_clip": 1.20780659, + "balance_loss_mlp": 1.09594464, + "epoch": 0.05825943183526229, + "flos": 19137375025920.0, + "grad_norm": 3.3387963846479662, + "language_loss": 0.82482082, + "learning_rate": 3.991629295419945e-06, + "loss": 0.85541141, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 4.342766523361206 + }, + { + "auxiliary_loss_clip": 0.01588853, + "auxiliary_loss_mlp": 0.01456821, + "balance_loss_clip": 1.20990503, + "balance_loss_mlp": 1.07840276, + "epoch": 0.058319555087930255, + "flos": 29025003715200.0, + "grad_norm": 2.2866480086686867, + "language_loss": 0.77953851, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80999529, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 2.8351151943206787 + }, + { + "auxiliary_loss_clip": 0.01582783, + "auxiliary_loss_mlp": 0.01443858, + "balance_loss_clip": 1.20180106, + "balance_loss_mlp": 1.06143451, + "epoch": 0.05837967834059823, + "flos": 18882002174400.0, + "grad_norm": 4.523498247025214, + "language_loss": 0.92043132, + "learning_rate": 3.991557954072958e-06, + "loss": 0.95069766, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 2.819286823272705 + }, + { + "auxiliary_loss_clip": 0.0159109, + "auxiliary_loss_mlp": 0.01452088, + "balance_loss_clip": 1.21214628, + "balance_loss_mlp": 1.07901037, + "epoch": 0.05843980159326619, + "flos": 25705460070720.0, + "grad_norm": 1.79862487918811, + "language_loss": 0.86109889, + "learning_rate": 3.991522170118673e-06, + "loss": 0.89153063, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 2.8129355907440186 + }, + { + "auxiliary_loss_clip": 0.01581949, + "auxiliary_loss_mlp": 0.01451943, + "balance_loss_clip": 1.20105445, + "balance_loss_mlp": 1.07485998, + "epoch": 0.058499924845934165, + "flos": 25554239039520.0, + "grad_norm": 2.343460852265241, + "language_loss": 0.87417006, + "learning_rate": 3.991486310645667e-06, + "loss": 0.90450895, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 2.770080804824829 + }, + { + "auxiliary_loss_clip": 0.01592965, + "auxiliary_loss_mlp": 0.01443707, + "balance_loss_clip": 1.21353006, + "balance_loss_mlp": 1.06261873, + "epoch": 0.05856004809860214, + "flos": 16438076775360.0, + "grad_norm": 2.013937424188502, + "language_loss": 0.75074714, + "learning_rate": 3.991450375655301e-06, + "loss": 0.78111386, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.81766939163208 + }, + { + "auxiliary_loss_clip": 0.01588145, + "auxiliary_loss_mlp": 0.0145674, + "balance_loss_clip": 1.20778787, + "balance_loss_mlp": 1.07336354, + "epoch": 0.0586201713512701, + "flos": 39462027266880.0, + "grad_norm": 2.0274102236525504, + "language_loss": 0.76761109, + "learning_rate": 3.991414365148936e-06, + "loss": 0.79805994, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 2.926370143890381 + }, + { + "auxiliary_loss_clip": 0.01578799, + "auxiliary_loss_mlp": 0.01459717, + "balance_loss_clip": 1.19844878, + "balance_loss_mlp": 1.08320653, + "epoch": 0.058680294603938074, + "flos": 23367279474720.0, + "grad_norm": 2.5420619256467347, + "language_loss": 0.76723957, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79762471, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 2.813633680343628 + }, + { + "auxiliary_loss_clip": 0.01579255, + "auxiliary_loss_mlp": 0.01470084, + "balance_loss_clip": 1.19936419, + "balance_loss_mlp": 1.09395456, + "epoch": 0.05874041785660604, + "flos": 32234668387200.0, + "grad_norm": 3.479574186278513, + "language_loss": 0.87596202, + "learning_rate": 3.991342117593679e-06, + "loss": 0.9064554, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 2.8534157276153564 + }, + { + "auxiliary_loss_clip": 0.01588876, + "auxiliary_loss_mlp": 0.01450643, + "balance_loss_clip": 1.20995307, + "balance_loss_mlp": 1.07298779, + "epoch": 0.05880054110927401, + "flos": 22312259210880.0, + "grad_norm": 1.7736246988101754, + "language_loss": 0.79492837, + "learning_rate": 3.991305880547527e-06, + "loss": 0.82532358, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.8150570392608643 + }, + { + "auxiliary_loss_clip": 0.01572908, + "auxiliary_loss_mlp": 0.01458496, + "balance_loss_clip": 1.19322228, + "balance_loss_mlp": 1.08866084, + "epoch": 0.05886066436194198, + "flos": 27382887633600.0, + "grad_norm": 2.37597952833445, + "language_loss": 0.81052637, + "learning_rate": 3.991269567990855e-06, + "loss": 0.8408404, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 2.817873239517212 + }, + { + "auxiliary_loss_clip": 0.01700253, + "auxiliary_loss_mlp": 0.01390762, + "balance_loss_clip": 1.31601453, + "balance_loss_mlp": 1.06117249, + "epoch": 0.05892078761460995, + "flos": 59590057740480.0, + "grad_norm": 0.941633116445978, + "language_loss": 0.58994734, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.62085748, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 3.275479316711426 + }, + { + "auxiliary_loss_clip": 0.01581108, + "auxiliary_loss_mlp": 0.01452618, + "balance_loss_clip": 1.20166945, + "balance_loss_mlp": 1.08030379, + "epoch": 0.05898091086727792, + "flos": 15415902662400.0, + "grad_norm": 2.3012904593466774, + "language_loss": 0.86973977, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.90007704, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.8399996757507324 + }, + { + "auxiliary_loss_clip": 0.01579816, + "auxiliary_loss_mlp": 0.01454524, + "balance_loss_clip": 1.20300817, + "balance_loss_mlp": 1.09060168, + "epoch": 0.059041034119945886, + "flos": 23657356956960.0, + "grad_norm": 2.6724171457313197, + "language_loss": 0.79569149, + "learning_rate": 3.991160177271513e-06, + "loss": 0.82603484, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.7902233600616455 + }, + { + "auxiliary_loss_clip": 0.01572346, + "auxiliary_loss_mlp": 0.01475318, + "balance_loss_clip": 1.19541907, + "balance_loss_mlp": 1.11711812, + "epoch": 0.05910115737261386, + "flos": 24756412178880.0, + "grad_norm": 2.693856715134075, + "language_loss": 0.84633398, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.87681061, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.8620145320892334 + }, + { + "auxiliary_loss_clip": 0.01579179, + "auxiliary_loss_mlp": 0.0146743, + "balance_loss_clip": 1.20177448, + "balance_loss_mlp": 1.10465264, + "epoch": 0.05916128062528183, + "flos": 11730348630720.0, + "grad_norm": 2.036730409553681, + "language_loss": 0.85111421, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.88158029, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 2.8130784034729004 + }, + { + "auxiliary_loss_clip": 0.01583016, + "auxiliary_loss_mlp": 0.01461166, + "balance_loss_clip": 1.20428002, + "balance_loss_mlp": 1.11116755, + "epoch": 0.059221403877949795, + "flos": 21904868836800.0, + "grad_norm": 2.730734996208866, + "language_loss": 0.77644145, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80688322, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 2.764394760131836 + }, + { + "auxiliary_loss_clip": 0.01580737, + "auxiliary_loss_mlp": 0.01468256, + "balance_loss_clip": 1.2037847, + "balance_loss_mlp": 1.11043692, + "epoch": 0.05928152713061777, + "flos": 20516267126880.0, + "grad_norm": 1.9745155342165281, + "language_loss": 0.90719366, + "learning_rate": 3.991013265915661e-06, + "loss": 0.93768358, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 2.7723734378814697 + }, + { + "auxiliary_loss_clip": 0.01574736, + "auxiliary_loss_mlp": 0.01473509, + "balance_loss_clip": 1.19652343, + "balance_loss_mlp": 1.12732565, + "epoch": 0.05934165038328574, + "flos": 24497246511360.0, + "grad_norm": 2.04772008757915, + "language_loss": 0.75592124, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78640372, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.775637149810791 + }, + { + "auxiliary_loss_clip": 0.01566616, + "auxiliary_loss_mlp": 0.0147851, + "balance_loss_clip": 1.18798947, + "balance_loss_mlp": 1.1281302, + "epoch": 0.059401773635953704, + "flos": 38731144337280.0, + "grad_norm": 3.5778174755555967, + "language_loss": 0.72295278, + "learning_rate": 3.990939357235621e-06, + "loss": 0.75340402, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.947162628173828 + }, + { + "auxiliary_loss_clip": 0.01661243, + "auxiliary_loss_mlp": 0.01382408, + "balance_loss_clip": 1.27635455, + "balance_loss_mlp": 1.09172821, + "epoch": 0.059461896888621676, + "flos": 58029487931520.0, + "grad_norm": 0.9405889746123651, + "language_loss": 0.71094418, + "learning_rate": 3.99090228964997e-06, + "loss": 0.74138069, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.1573116779327393 + }, + { + "auxiliary_loss_clip": 0.01574279, + "auxiliary_loss_mlp": 0.0146174, + "balance_loss_clip": 1.19645834, + "balance_loss_mlp": 1.10697365, + "epoch": 0.05952202014128964, + "flos": 22129974652320.0, + "grad_norm": 2.7752587962786017, + "language_loss": 0.78156084, + "learning_rate": 3.990865146569105e-06, + "loss": 0.81192106, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.8605682849884033 + }, + { + "auxiliary_loss_clip": 0.01568276, + "auxiliary_loss_mlp": 0.01468919, + "balance_loss_clip": 1.18968832, + "balance_loss_mlp": 1.11796653, + "epoch": 0.059582143393957614, + "flos": 20447692284960.0, + "grad_norm": 3.6810489456344992, + "language_loss": 0.86376333, + "learning_rate": 3.990827927994434e-06, + "loss": 0.89413524, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.8428733348846436 + }, + { + "auxiliary_loss_clip": 0.01564618, + "auxiliary_loss_mlp": 0.01475999, + "balance_loss_clip": 1.18702626, + "balance_loss_mlp": 1.12600017, + "epoch": 0.059642266646625586, + "flos": 20596978980000.0, + "grad_norm": 2.6693321374406063, + "language_loss": 0.77362061, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.80402672, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 2.79559588432312 + }, + { + "auxiliary_loss_clip": 0.01568486, + "auxiliary_loss_mlp": 0.01462559, + "balance_loss_clip": 1.1909548, + "balance_loss_mlp": 1.10607612, + "epoch": 0.05970238989929355, + "flos": 19354591784160.0, + "grad_norm": 2.9623947489462736, + "language_loss": 0.75044954, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.78076005, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.7762410640716553 + }, + { + "auxiliary_loss_clip": 0.01574733, + "auxiliary_loss_mlp": 0.01485165, + "balance_loss_clip": 1.19532621, + "balance_loss_mlp": 1.13974404, + "epoch": 0.05976251315196152, + "flos": 30266708204160.0, + "grad_norm": 1.813045814934977, + "language_loss": 0.78694546, + "learning_rate": 3.990715819321712e-06, + "loss": 0.8175444, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.85969614982605 + }, + { + "auxiliary_loss_clip": 0.0156713, + "auxiliary_loss_mlp": 0.0148837, + "balance_loss_clip": 1.18691444, + "balance_loss_mlp": 1.13856256, + "epoch": 0.05982263640462949, + "flos": 23187384390240.0, + "grad_norm": 3.5139930550108547, + "language_loss": 0.80606866, + "learning_rate": 3.99067829878596e-06, + "loss": 0.83662367, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 2.896181344985962 + }, + { + "auxiliary_loss_clip": 0.01571679, + "auxiliary_loss_mlp": 0.01449579, + "balance_loss_clip": 1.19172478, + "balance_loss_mlp": 1.09004426, + "epoch": 0.05988275965729746, + "flos": 27853125697440.0, + "grad_norm": 2.8249957348031667, + "language_loss": 0.87120253, + "learning_rate": 3.990640702763487e-06, + "loss": 0.90141511, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.8022196292877197 + }, + { + "auxiliary_loss_clip": 0.01572597, + "auxiliary_loss_mlp": 0.01458499, + "balance_loss_clip": 1.19471979, + "balance_loss_mlp": 1.10544896, + "epoch": 0.05994288290996543, + "flos": 24682261897440.0, + "grad_norm": 4.153031608699103, + "language_loss": 0.8819797, + "learning_rate": 3.990603031255718e-06, + "loss": 0.91229075, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 2.8305349349975586 + }, + { + "auxiliary_loss_clip": 0.01660575, + "auxiliary_loss_mlp": 0.0137822, + "balance_loss_clip": 1.27721179, + "balance_loss_mlp": 1.08677673, + "epoch": 0.0600030061626334, + "flos": 69936811178400.0, + "grad_norm": 1.024483676942685, + "language_loss": 0.75398344, + "learning_rate": 3.990565284264083e-06, + "loss": 0.78437138, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 3.412402391433716 + }, + { + "auxiliary_loss_clip": 0.01570871, + "auxiliary_loss_mlp": 0.01459432, + "balance_loss_clip": 1.19294071, + "balance_loss_mlp": 1.10104108, + "epoch": 0.06006312941530137, + "flos": 26542467084960.0, + "grad_norm": 2.60501198739526, + "language_loss": 0.75896072, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78926373, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 2.8070902824401855 + }, + { + "auxiliary_loss_clip": 0.01559784, + "auxiliary_loss_mlp": 0.01444374, + "balance_loss_clip": 1.18012595, + "balance_loss_mlp": 1.08769989, + "epoch": 0.060123252667969335, + "flos": 27346438307520.0, + "grad_norm": 3.605109504630504, + "language_loss": 0.82695311, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85699469, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.887251138687134 + }, + { + "auxiliary_loss_clip": 0.01569124, + "auxiliary_loss_mlp": 0.01451657, + "balance_loss_clip": 1.19131756, + "balance_loss_mlp": 1.08315682, + "epoch": 0.06018337592063731, + "flos": 27019190864160.0, + "grad_norm": 3.4042124381777557, + "language_loss": 0.86219501, + "learning_rate": 3.990451590400309e-06, + "loss": 0.89240277, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.77260422706604 + }, + { + "auxiliary_loss_clip": 0.01562262, + "auxiliary_loss_mlp": 0.01443024, + "balance_loss_clip": 1.18416238, + "balance_loss_mlp": 1.07261693, + "epoch": 0.06024349917330528, + "flos": 25595315601120.0, + "grad_norm": 19.40194862657331, + "language_loss": 0.74174196, + "learning_rate": 3.990413541487551e-06, + "loss": 0.77179486, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 2.862771511077881 + }, + { + "auxiliary_loss_clip": 0.01564573, + "auxiliary_loss_mlp": 0.01449727, + "balance_loss_clip": 1.18760335, + "balance_loss_mlp": 1.07931948, + "epoch": 0.060303622425973244, + "flos": 26135266351680.0, + "grad_norm": 2.5078348797055203, + "language_loss": 0.75675941, + "learning_rate": 3.990375417098112e-06, + "loss": 0.78690243, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.835632562637329 + }, + { + "auxiliary_loss_clip": 0.01558979, + "auxiliary_loss_mlp": 0.01448247, + "balance_loss_clip": 1.18078327, + "balance_loss_mlp": 1.07383466, + "epoch": 0.060363745678641216, + "flos": 20379345012000.0, + "grad_norm": 7.08205429133608, + "language_loss": 0.70418173, + "learning_rate": 3.990337217233437e-06, + "loss": 0.734254, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.791011333465576 + }, + { + "auxiliary_loss_clip": 0.01558051, + "auxiliary_loss_mlp": 0.01460963, + "balance_loss_clip": 1.17926908, + "balance_loss_mlp": 1.08101892, + "epoch": 0.06042386893130918, + "flos": 17751124861920.0, + "grad_norm": 3.9626289368154284, + "language_loss": 0.83375919, + "learning_rate": 3.990298941894976e-06, + "loss": 0.8639493, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 4.417477607727051 + }, + { + "auxiliary_loss_clip": 0.01652248, + "auxiliary_loss_mlp": 0.01358803, + "balance_loss_clip": 1.26990223, + "balance_loss_mlp": 1.05515289, + "epoch": 0.06048399218397715, + "flos": 68545402784640.0, + "grad_norm": 0.8815345850295025, + "language_loss": 0.58846903, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61857957, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 4.912698745727539 + }, + { + "auxiliary_loss_clip": 0.01560601, + "auxiliary_loss_mlp": 0.01453713, + "balance_loss_clip": 1.18207395, + "balance_loss_mlp": 1.07739305, + "epoch": 0.060544115436645125, + "flos": 23260776108480.0, + "grad_norm": 2.050560615643458, + "language_loss": 0.74755138, + "learning_rate": 3.990222164802503e-06, + "loss": 0.77769452, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 2.7700188159942627 + }, + { + "auxiliary_loss_clip": 0.0155828, + "auxiliary_loss_mlp": 0.01463129, + "balance_loss_clip": 1.17898583, + "balance_loss_mlp": 1.09558249, + "epoch": 0.06060423868931309, + "flos": 23880718077120.0, + "grad_norm": 2.093451104816588, + "language_loss": 0.80515856, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83537269, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 4.245863676071167 + }, + { + "auxiliary_loss_clip": 0.01562171, + "auxiliary_loss_mlp": 0.01443392, + "balance_loss_clip": 1.18456936, + "balance_loss_mlp": 1.06707191, + "epoch": 0.06066436194198106, + "flos": 18729719154720.0, + "grad_norm": 2.1576755075363248, + "language_loss": 0.78052461, + "learning_rate": 3.990145085832335e-06, + "loss": 0.81058025, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 2.800006151199341 + }, + { + "auxiliary_loss_clip": 0.01562186, + "auxiliary_loss_mlp": 0.01452794, + "balance_loss_clip": 1.18361235, + "balance_loss_mlp": 1.08849013, + "epoch": 0.06072448519464903, + "flos": 24642550749600.0, + "grad_norm": 2.318431937528406, + "language_loss": 0.93020296, + "learning_rate": 3.990106433146769e-06, + "loss": 0.96035278, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.7522380352020264 + }, + { + "auxiliary_loss_clip": 0.01559072, + "auxiliary_loss_mlp": 0.01461231, + "balance_loss_clip": 1.179919, + "balance_loss_mlp": 1.09616423, + "epoch": 0.060784608447317, + "flos": 17380449311040.0, + "grad_norm": 3.3785016601935123, + "language_loss": 0.72046161, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.75066459, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 2.7589104175567627 + }, + { + "auxiliary_loss_clip": 0.01560574, + "auxiliary_loss_mlp": 0.01448159, + "balance_loss_clip": 1.18172812, + "balance_loss_mlp": 1.08156657, + "epoch": 0.06084473169998497, + "flos": 23694261420960.0, + "grad_norm": 2.0678846744245654, + "language_loss": 0.87203205, + "learning_rate": 3.990028901381999e-06, + "loss": 0.90211934, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.821277379989624 + }, + { + "auxiliary_loss_clip": 0.01555013, + "auxiliary_loss_mlp": 0.01444124, + "balance_loss_clip": 1.17476201, + "balance_loss_mlp": 1.08344448, + "epoch": 0.06090485495265294, + "flos": 23548388260320.0, + "grad_norm": 2.299244766909432, + "language_loss": 0.77432346, + "learning_rate": 3.989990022305734e-06, + "loss": 0.80431485, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 2.810105800628662 + }, + { + "auxiliary_loss_clip": 0.01558914, + "auxiliary_loss_mlp": 0.01453925, + "balance_loss_clip": 1.17991018, + "balance_loss_mlp": 1.08294547, + "epoch": 0.06096497820532091, + "flos": 20341378559520.0, + "grad_norm": 3.222785053499707, + "language_loss": 0.85569882, + "learning_rate": 3.98995106776885e-06, + "loss": 0.88582724, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 2.7869768142700195 + }, + { + "auxiliary_loss_clip": 0.01562226, + "auxiliary_loss_mlp": 0.01446669, + "balance_loss_clip": 1.18208814, + "balance_loss_mlp": 1.08350992, + "epoch": 0.061025101457988874, + "flos": 26941247766720.0, + "grad_norm": 2.2867286191380023, + "language_loss": 0.73224044, + "learning_rate": 3.98991203777282e-06, + "loss": 0.76232934, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 2.8315205574035645 + }, + { + "auxiliary_loss_clip": 0.01560128, + "auxiliary_loss_mlp": 0.01443784, + "balance_loss_clip": 1.17909992, + "balance_loss_mlp": 1.08272326, + "epoch": 0.061085224710656846, + "flos": 25377454064160.0, + "grad_norm": 1.7595578990171403, + "language_loss": 0.79137158, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.82141066, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 2.854954719543457 + }, + { + "auxiliary_loss_clip": 0.01554137, + "auxiliary_loss_mlp": 0.01447163, + "balance_loss_clip": 1.17474723, + "balance_loss_mlp": 1.08228755, + "epoch": 0.06114534796332482, + "flos": 24826959285120.0, + "grad_norm": 1.8627638953658012, + "language_loss": 0.76038045, + "learning_rate": 3.989833751409254e-06, + "loss": 0.79039341, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 2.858203887939453 + }, + { + "auxiliary_loss_clip": 0.01563746, + "auxiliary_loss_mlp": 0.01449519, + "balance_loss_clip": 1.18408084, + "balance_loss_mlp": 1.08674109, + "epoch": 0.061205471215992784, + "flos": 20633542090560.0, + "grad_norm": 2.1525758619453326, + "language_loss": 0.86066234, + "learning_rate": 3.989794495044685e-06, + "loss": 0.89079499, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.894831657409668 + }, + { + "auxiliary_loss_clip": 0.01556227, + "auxiliary_loss_mlp": 0.01434818, + "balance_loss_clip": 1.17478871, + "balance_loss_mlp": 1.07471049, + "epoch": 0.061265594468660756, + "flos": 16510216864320.0, + "grad_norm": 3.0791384736641665, + "language_loss": 0.76989639, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79980683, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 2.785355806350708 + }, + { + "auxiliary_loss_clip": 0.01555061, + "auxiliary_loss_mlp": 0.0145452, + "balance_loss_clip": 1.17334402, + "balance_loss_mlp": 1.10032499, + "epoch": 0.06132571772132872, + "flos": 26248407145920.0, + "grad_norm": 1.9950742504555519, + "language_loss": 0.84303641, + "learning_rate": 3.989715755957418e-06, + "loss": 0.87313223, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.820850133895874 + }, + { + "auxiliary_loss_clip": 0.01552484, + "auxiliary_loss_mlp": 0.01443686, + "balance_loss_clip": 1.17221522, + "balance_loss_mlp": 1.07671165, + "epoch": 0.06138584097399669, + "flos": 37417830753600.0, + "grad_norm": 2.066165964260811, + "language_loss": 0.79507637, + "learning_rate": 3.989676273237705e-06, + "loss": 0.82503808, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.8614273071289062 + }, + { + "auxiliary_loss_clip": 0.01563261, + "auxiliary_loss_mlp": 0.01463183, + "balance_loss_clip": 1.18345499, + "balance_loss_mlp": 1.11528289, + "epoch": 0.061445964226664665, + "flos": 17422587861120.0, + "grad_norm": 2.0341306845080287, + "language_loss": 0.87605602, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.90632045, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.797210454940796 + }, + { + "auxiliary_loss_clip": 0.01559404, + "auxiliary_loss_mlp": 0.01449489, + "balance_loss_clip": 1.17852402, + "balance_loss_mlp": 1.08938205, + "epoch": 0.06150608747933263, + "flos": 22602147052320.0, + "grad_norm": 1.8667554588150634, + "language_loss": 0.83104599, + "learning_rate": 3.989597081453611e-06, + "loss": 0.86113495, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 2.8064382076263428 + }, + { + "auxiliary_loss_clip": 0.01673526, + "auxiliary_loss_mlp": 0.01391342, + "balance_loss_clip": 1.28668714, + "balance_loss_mlp": 1.12355042, + "epoch": 0.0615662107320006, + "flos": 56747048234400.0, + "grad_norm": 0.9185197237164349, + "language_loss": 0.65049362, + "learning_rate": 3.989557372392231e-06, + "loss": 0.68114227, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.370718479156494 + }, + { + "auxiliary_loss_clip": 0.01556995, + "auxiliary_loss_mlp": 0.01453449, + "balance_loss_clip": 1.17753482, + "balance_loss_mlp": 1.08838296, + "epoch": 0.06162633398466857, + "flos": 22566987283680.0, + "grad_norm": 2.2166279538737457, + "language_loss": 0.88455677, + "learning_rate": 3.989517587886636e-06, + "loss": 0.91466123, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.823582649230957 + }, + { + "auxiliary_loss_clip": 0.01555241, + "auxiliary_loss_mlp": 0.014408, + "balance_loss_clip": 1.1757108, + "balance_loss_mlp": 1.07554293, + "epoch": 0.06168645723733654, + "flos": 25595467313760.0, + "grad_norm": 2.6661396073496486, + "language_loss": 0.84853524, + "learning_rate": 3.989477727938335e-06, + "loss": 0.87849569, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.8407437801361084 + }, + { + "auxiliary_loss_clip": 0.01549881, + "auxiliary_loss_mlp": 0.01448912, + "balance_loss_clip": 1.16887784, + "balance_loss_mlp": 1.09013939, + "epoch": 0.06174658049000451, + "flos": 15999774586560.0, + "grad_norm": 2.869863819143944, + "language_loss": 0.822308, + "learning_rate": 3.989437792548839e-06, + "loss": 0.85229588, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.796182870864868 + }, + { + "auxiliary_loss_clip": 0.01556731, + "auxiliary_loss_mlp": 0.01425275, + "balance_loss_clip": 1.17549753, + "balance_loss_mlp": 1.05276942, + "epoch": 0.06180670374267248, + "flos": 11287267493760.0, + "grad_norm": 2.9643228134249635, + "language_loss": 0.8456682, + "learning_rate": 3.989397781719663e-06, + "loss": 0.87548828, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.8128223419189453 + }, + { + "auxiliary_loss_clip": 0.01677178, + "auxiliary_loss_mlp": 0.01330276, + "balance_loss_clip": 1.29127169, + "balance_loss_mlp": 1.03883362, + "epoch": 0.06186682699534045, + "flos": 65136006600480.0, + "grad_norm": 1.0420884603121159, + "language_loss": 0.6048755, + "learning_rate": 3.989357695452323e-06, + "loss": 0.63495004, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 3.1564862728118896 + }, + { + "auxiliary_loss_clip": 0.01561313, + "auxiliary_loss_mlp": 0.01435042, + "balance_loss_clip": 1.17882967, + "balance_loss_mlp": 1.06921244, + "epoch": 0.061926950248008414, + "flos": 21107876395680.0, + "grad_norm": 3.338529957702823, + "language_loss": 0.8274368, + "learning_rate": 3.98931753374834e-06, + "loss": 0.85740036, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.8239355087280273 + }, + { + "auxiliary_loss_clip": 0.01554655, + "auxiliary_loss_mlp": 0.01447744, + "balance_loss_clip": 1.1722523, + "balance_loss_mlp": 1.07352221, + "epoch": 0.061987073500676386, + "flos": 17750366298720.0, + "grad_norm": 4.994295910252714, + "language_loss": 0.80062056, + "learning_rate": 3.989277296609237e-06, + "loss": 0.83064461, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 2.752439498901367 + }, + { + "auxiliary_loss_clip": 0.015627, + "auxiliary_loss_mlp": 0.01436965, + "balance_loss_clip": 1.18345904, + "balance_loss_mlp": 1.0682745, + "epoch": 0.06204719675334436, + "flos": 21838455900000.0, + "grad_norm": 1.734650182990616, + "language_loss": 0.77363253, + "learning_rate": 3.98923698403654e-06, + "loss": 0.80362916, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 2.8213908672332764 + }, + { + "auxiliary_loss_clip": 0.015545, + "auxiliary_loss_mlp": 0.01442039, + "balance_loss_clip": 1.17427707, + "balance_loss_mlp": 1.07010603, + "epoch": 0.06210732000601232, + "flos": 19355691700800.0, + "grad_norm": 3.208246420923535, + "language_loss": 0.8929739, + "learning_rate": 3.989196596031776e-06, + "loss": 0.92293918, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 2.7800076007843018 + }, + { + "auxiliary_loss_clip": 0.01552075, + "auxiliary_loss_mlp": 0.01437227, + "balance_loss_clip": 1.17075741, + "balance_loss_mlp": 1.07750106, + "epoch": 0.062167443258680295, + "flos": 24751443589920.0, + "grad_norm": 2.1068556510741487, + "language_loss": 0.85044795, + "learning_rate": 3.989156132596479e-06, + "loss": 0.88034099, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 2.8670034408569336 + }, + { + "auxiliary_loss_clip": 0.01559437, + "auxiliary_loss_mlp": 0.01437835, + "balance_loss_clip": 1.17681551, + "balance_loss_mlp": 1.07410312, + "epoch": 0.06222756651134827, + "flos": 34461301171680.0, + "grad_norm": 4.772080604622932, + "language_loss": 0.81025326, + "learning_rate": 3.989115593732182e-06, + "loss": 0.84022599, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 2.938075065612793 + }, + { + "auxiliary_loss_clip": 0.01550635, + "auxiliary_loss_mlp": 0.01418847, + "balance_loss_clip": 1.17116463, + "balance_loss_mlp": 1.05835867, + "epoch": 0.06228768976401623, + "flos": 25668859032000.0, + "grad_norm": 4.741618712661198, + "language_loss": 0.78576434, + "learning_rate": 3.989074979440421e-06, + "loss": 0.81545919, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 2.8405590057373047 + }, + { + "auxiliary_loss_clip": 0.01555836, + "auxiliary_loss_mlp": 0.01420573, + "balance_loss_clip": 1.17498827, + "balance_loss_mlp": 1.06561553, + "epoch": 0.062347813016684205, + "flos": 25297652486880.0, + "grad_norm": 12.989147235903943, + "language_loss": 0.86719871, + "learning_rate": 3.989034289722739e-06, + "loss": 0.89696282, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.876368999481201 + }, + { + "auxiliary_loss_clip": 0.01549678, + "auxiliary_loss_mlp": 0.01428988, + "balance_loss_clip": 1.16779494, + "balance_loss_mlp": 1.07498419, + "epoch": 0.06240793626935217, + "flos": 26909501532480.0, + "grad_norm": 3.1002434518236486, + "language_loss": 0.81398523, + "learning_rate": 3.988993524580676e-06, + "loss": 0.84377193, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 2.7984304428100586 + }, + { + "auxiliary_loss_clip": 0.01557708, + "auxiliary_loss_mlp": 0.01430242, + "balance_loss_clip": 1.1770891, + "balance_loss_mlp": 1.07108855, + "epoch": 0.06246805952202014, + "flos": 21617901463680.0, + "grad_norm": 3.6065964096904293, + "language_loss": 0.85936272, + "learning_rate": 3.98895268401578e-06, + "loss": 0.88924217, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.905783176422119 + }, + { + "auxiliary_loss_clip": 0.01552389, + "auxiliary_loss_mlp": 0.01439623, + "balance_loss_clip": 1.17057002, + "balance_loss_mlp": 1.08485603, + "epoch": 0.0625281827746881, + "flos": 19313591078880.0, + "grad_norm": 2.899156638074964, + "language_loss": 0.81092334, + "learning_rate": 3.9889117680296e-06, + "loss": 0.84084344, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 2.8208913803100586 + }, + { + "auxiliary_loss_clip": 0.01568618, + "auxiliary_loss_mlp": 0.01430327, + "balance_loss_clip": 1.18830323, + "balance_loss_mlp": 1.07937431, + "epoch": 0.06258830602735609, + "flos": 27748253242080.0, + "grad_norm": 2.9835337541286795, + "language_loss": 0.70129943, + "learning_rate": 3.988870776623685e-06, + "loss": 0.73128885, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 2.805424690246582 + }, + { + "auxiliary_loss_clip": 0.01556198, + "auxiliary_loss_mlp": 0.01435841, + "balance_loss_clip": 1.17553067, + "balance_loss_mlp": 1.08565176, + "epoch": 0.06264842928002405, + "flos": 23224895704800.0, + "grad_norm": 7.4739052534310515, + "language_loss": 0.81761485, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.84753525, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 2.848066806793213 + }, + { + "auxiliary_loss_clip": 0.01553296, + "auxiliary_loss_mlp": 0.01430622, + "balance_loss_clip": 1.17166042, + "balance_loss_mlp": 1.08386612, + "epoch": 0.06270855253269202, + "flos": 38402569408320.0, + "grad_norm": 1.7969624379251956, + "language_loss": 0.76403576, + "learning_rate": 3.988788567558874e-06, + "loss": 0.79387498, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 4.48112154006958 + }, + { + "auxiliary_loss_clip": 0.01576077, + "auxiliary_loss_mlp": 0.01434777, + "balance_loss_clip": 1.19111419, + "balance_loss_mlp": 1.09336138, + "epoch": 0.06276867578535998, + "flos": 22455629112960.0, + "grad_norm": 2.9062229294579196, + "language_loss": 0.92587316, + "learning_rate": 3.988747349903097e-06, + "loss": 0.95598167, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 2.8743035793304443 + }, + { + "auxiliary_loss_clip": 0.01546297, + "auxiliary_loss_mlp": 0.01432624, + "balance_loss_clip": 1.16492617, + "balance_loss_mlp": 1.07976496, + "epoch": 0.06282879903802796, + "flos": 22932694245600.0, + "grad_norm": 1.8304806664663802, + "language_loss": 0.86267614, + "learning_rate": 3.988706056833821e-06, + "loss": 0.89246535, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 4.424199342727661 + }, + { + "auxiliary_loss_clip": 0.01557899, + "auxiliary_loss_mlp": 0.01421506, + "balance_loss_clip": 1.17737627, + "balance_loss_mlp": 1.07093501, + "epoch": 0.06288892229069593, + "flos": 34821849903840.0, + "grad_norm": 2.1252344409883444, + "language_loss": 0.78704071, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.81683475, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 4.363126516342163 + }, + { + "auxiliary_loss_clip": 0.01556431, + "auxiliary_loss_mlp": 0.01427866, + "balance_loss_clip": 1.17399037, + "balance_loss_mlp": 1.07767713, + "epoch": 0.06294904554336389, + "flos": 19429159275360.0, + "grad_norm": 3.795110689929799, + "language_loss": 0.77444422, + "learning_rate": 3.988623244461039e-06, + "loss": 0.8042872, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 2.782405376434326 + }, + { + "auxiliary_loss_clip": 0.01552785, + "auxiliary_loss_mlp": 0.01426792, + "balance_loss_clip": 1.17116332, + "balance_loss_mlp": 1.0720253, + "epoch": 0.06300916879603187, + "flos": 40665082596480.0, + "grad_norm": 1.9735645674655882, + "language_loss": 0.77075887, + "learning_rate": 3.988581725160672e-06, + "loss": 0.80055463, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 2.9547691345214844 + }, + { + "auxiliary_loss_clip": 0.01549882, + "auxiliary_loss_mlp": 0.0141383, + "balance_loss_clip": 1.167413, + "balance_loss_mlp": 1.05944431, + "epoch": 0.06306929204869983, + "flos": 23806340226720.0, + "grad_norm": 5.52762924030616, + "language_loss": 0.77652854, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80616564, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 2.7970621585845947 + }, + { + "auxiliary_loss_clip": 0.01551648, + "auxiliary_loss_mlp": 0.01414289, + "balance_loss_clip": 1.16899824, + "balance_loss_mlp": 1.05856848, + "epoch": 0.0631294153013678, + "flos": 18917768793600.0, + "grad_norm": 3.789184473243765, + "language_loss": 0.83168209, + "learning_rate": 3.988498460339862e-06, + "loss": 0.86134136, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.745177984237671 + }, + { + "auxiliary_loss_clip": 0.01564246, + "auxiliary_loss_mlp": 0.01423678, + "balance_loss_clip": 1.18412566, + "balance_loss_mlp": 1.06299865, + "epoch": 0.06318953855403578, + "flos": 24282495083520.0, + "grad_norm": 1.865672884362007, + "language_loss": 0.76711112, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79699039, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.7812657356262207 + }, + { + "auxiliary_loss_clip": 0.01556953, + "auxiliary_loss_mlp": 0.0141117, + "balance_loss_clip": 1.17514062, + "balance_loss_mlp": 1.0512538, + "epoch": 0.06324966180670374, + "flos": 22531144808160.0, + "grad_norm": 2.288815492765329, + "language_loss": 0.80441874, + "learning_rate": 3.98841489390281e-06, + "loss": 0.83410001, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.9117908477783203 + }, + { + "auxiliary_loss_clip": 0.01560523, + "auxiliary_loss_mlp": 0.01409896, + "balance_loss_clip": 1.1759311, + "balance_loss_mlp": 1.04921591, + "epoch": 0.06330978505937171, + "flos": 15779902857120.0, + "grad_norm": 2.7804825882839546, + "language_loss": 0.78110421, + "learning_rate": 3.988372997582155e-06, + "loss": 0.81080842, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 2.9599692821502686 + }, + { + "auxiliary_loss_clip": 0.01558401, + "auxiliary_loss_mlp": 0.01415315, + "balance_loss_clip": 1.17736578, + "balance_loss_mlp": 1.05387235, + "epoch": 0.06336990831203967, + "flos": 21473279932320.0, + "grad_norm": 4.012922955070183, + "language_loss": 0.85114527, + "learning_rate": 3.988331025862195e-06, + "loss": 0.88088238, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.836310386657715 + }, + { + "auxiliary_loss_clip": 0.01565531, + "auxiliary_loss_mlp": 0.01425993, + "balance_loss_clip": 1.18209684, + "balance_loss_mlp": 1.06626701, + "epoch": 0.06343003156470765, + "flos": 18480869946720.0, + "grad_norm": 3.9685473177549055, + "language_loss": 0.85439897, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.88431418, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 2.77882719039917 + }, + { + "auxiliary_loss_clip": 0.01558498, + "auxiliary_loss_mlp": 0.01417026, + "balance_loss_clip": 1.17547059, + "balance_loss_mlp": 1.05691862, + "epoch": 0.06349015481737562, + "flos": 25157203053120.0, + "grad_norm": 4.778051614742373, + "language_loss": 0.80886924, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83862448, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 2.7981388568878174 + }, + { + "auxiliary_loss_clip": 0.01563236, + "auxiliary_loss_mlp": 0.01430047, + "balance_loss_clip": 1.17713964, + "balance_loss_mlp": 1.06707835, + "epoch": 0.06355027807004358, + "flos": 26874303835680.0, + "grad_norm": 6.703368019881627, + "language_loss": 0.81055927, + "learning_rate": 3.988204658322426e-06, + "loss": 0.84049201, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.8811092376708984 + }, + { + "auxiliary_loss_clip": 0.0156838, + "auxiliary_loss_mlp": 0.01420068, + "balance_loss_clip": 1.18363619, + "balance_loss_mlp": 1.05938816, + "epoch": 0.06361040132271156, + "flos": 21398788297440.0, + "grad_norm": 2.0046875509972404, + "language_loss": 0.83591664, + "learning_rate": 3.988162385021196e-06, + "loss": 0.8658011, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 2.832578659057617 + }, + { + "auxiliary_loss_clip": 0.01564831, + "auxiliary_loss_mlp": 0.01415005, + "balance_loss_clip": 1.17921865, + "balance_loss_mlp": 1.05432487, + "epoch": 0.06367052457537953, + "flos": 25735840891200.0, + "grad_norm": 2.186816618126924, + "language_loss": 0.8757689, + "learning_rate": 3.988120036328651e-06, + "loss": 0.90556729, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.8000118732452393 + }, + { + "auxiliary_loss_clip": 0.01558958, + "auxiliary_loss_mlp": 0.01412187, + "balance_loss_clip": 1.17390716, + "balance_loss_mlp": 1.05303359, + "epoch": 0.0637306478280475, + "flos": 17632901694240.0, + "grad_norm": 2.312069092547139, + "language_loss": 0.912705, + "learning_rate": 3.988077612246394e-06, + "loss": 0.94241655, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.7955174446105957 + }, + { + "auxiliary_loss_clip": 0.0156372, + "auxiliary_loss_mlp": 0.01419615, + "balance_loss_clip": 1.17755222, + "balance_loss_mlp": 1.06484795, + "epoch": 0.06379077108071547, + "flos": 13664324818080.0, + "grad_norm": 1.9380447514156367, + "language_loss": 0.87347877, + "learning_rate": 3.988035112776035e-06, + "loss": 0.90331209, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.7185723781585693 + }, + { + "auxiliary_loss_clip": 0.01554265, + "auxiliary_loss_mlp": 0.01436198, + "balance_loss_clip": 1.16597438, + "balance_loss_mlp": 1.07570934, + "epoch": 0.06385089433338344, + "flos": 28481563573920.0, + "grad_norm": 2.517428895656694, + "language_loss": 0.77235258, + "learning_rate": 3.987992537919185e-06, + "loss": 0.80225724, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 2.900573492050171 + }, + { + "auxiliary_loss_clip": 0.01549123, + "auxiliary_loss_mlp": 0.01424793, + "balance_loss_clip": 1.16180778, + "balance_loss_mlp": 1.06201553, + "epoch": 0.0639110175860514, + "flos": 24313065544800.0, + "grad_norm": 1.9233599919729505, + "language_loss": 0.86661446, + "learning_rate": 3.987949887677459e-06, + "loss": 0.8963536, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.8093512058258057 + }, + { + "auxiliary_loss_clip": 0.01550088, + "auxiliary_loss_mlp": 0.014324, + "balance_loss_clip": 1.16250753, + "balance_loss_mlp": 1.08469009, + "epoch": 0.06397114083871938, + "flos": 22092956403840.0, + "grad_norm": 9.91870530905292, + "language_loss": 0.80679154, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.8366164, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.8513870239257812 + }, + { + "auxiliary_loss_clip": 0.01568049, + "auxiliary_loss_mlp": 0.01442028, + "balance_loss_clip": 1.18234098, + "balance_loss_mlp": 1.09164774, + "epoch": 0.06403126409138735, + "flos": 19574842795200.0, + "grad_norm": 2.9556246408053837, + "language_loss": 0.83970845, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86980921, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.8535056114196777 + }, + { + "auxiliary_loss_clip": 0.0156256, + "auxiliary_loss_mlp": 0.01427204, + "balance_loss_clip": 1.17509985, + "balance_loss_mlp": 1.07873154, + "epoch": 0.06409138734405531, + "flos": 40810614403680.0, + "grad_norm": 1.7929043160768872, + "language_loss": 0.67982829, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70972598, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.954591751098633 + }, + { + "auxiliary_loss_clip": 0.01558772, + "auxiliary_loss_mlp": 0.01423603, + "balance_loss_clip": 1.17351961, + "balance_loss_mlp": 1.07036209, + "epoch": 0.06415151059672328, + "flos": 20443292618400.0, + "grad_norm": 2.172993810666, + "language_loss": 0.9033761, + "learning_rate": 3.987778532894181e-06, + "loss": 0.93319988, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.8564586639404297 + }, + { + "auxiliary_loss_clip": 0.01554699, + "auxiliary_loss_mlp": 0.01449631, + "balance_loss_clip": 1.16663885, + "balance_loss_mlp": 1.10344732, + "epoch": 0.06421163384939126, + "flos": 18073707141600.0, + "grad_norm": 1.9411755803941126, + "language_loss": 0.83533114, + "learning_rate": 3.987735505752391e-06, + "loss": 0.86537445, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.753685474395752 + }, + { + "auxiliary_loss_clip": 0.01558291, + "auxiliary_loss_mlp": 0.01435113, + "balance_loss_clip": 1.17060447, + "balance_loss_mlp": 1.09236205, + "epoch": 0.06427175710205922, + "flos": 25121815715520.0, + "grad_norm": 2.7014076236793083, + "language_loss": 0.90058887, + "learning_rate": 3.987692403235471e-06, + "loss": 0.9305228, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.908350944519043 + }, + { + "auxiliary_loss_clip": 0.01567923, + "auxiliary_loss_mlp": 0.01433215, + "balance_loss_clip": 1.18030763, + "balance_loss_mlp": 1.0931344, + "epoch": 0.06433188035472719, + "flos": 17382269862720.0, + "grad_norm": 2.9910407326224457, + "language_loss": 0.96349102, + "learning_rate": 3.987649225345056e-06, + "loss": 0.9935025, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.852461099624634 + }, + { + "auxiliary_loss_clip": 0.01562504, + "auxiliary_loss_mlp": 0.01415859, + "balance_loss_clip": 1.17591298, + "balance_loss_mlp": 1.06357157, + "epoch": 0.06439200360739517, + "flos": 23548312404000.0, + "grad_norm": 1.9061737978106736, + "language_loss": 0.88657635, + "learning_rate": 3.987605972082782e-06, + "loss": 0.91636002, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 2.8642170429229736 + }, + { + "auxiliary_loss_clip": 0.01559782, + "auxiliary_loss_mlp": 0.01423091, + "balance_loss_clip": 1.17307401, + "balance_loss_mlp": 1.07499969, + "epoch": 0.06445212686006313, + "flos": 21981863730240.0, + "grad_norm": 3.5204741258747223, + "language_loss": 0.76003516, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78986382, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.799455404281616 + }, + { + "auxiliary_loss_clip": 0.01565159, + "auxiliary_loss_mlp": 0.01454291, + "balance_loss_clip": 1.17759585, + "balance_loss_mlp": 1.1002872, + "epoch": 0.0645122501127311, + "flos": 25923662961120.0, + "grad_norm": 1.915866344329765, + "language_loss": 0.80919111, + "learning_rate": 3.987519239449226e-06, + "loss": 0.83938563, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.8049657344818115 + }, + { + "auxiliary_loss_clip": 0.01568445, + "auxiliary_loss_mlp": 0.01398995, + "balance_loss_clip": 1.18084717, + "balance_loss_mlp": 1.05090415, + "epoch": 0.06457237336539907, + "flos": 25628199680160.0, + "grad_norm": 1.8121770297742197, + "language_loss": 0.80459356, + "learning_rate": 3.987475760081233e-06, + "loss": 0.83426797, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 2.847715377807617 + }, + { + "auxiliary_loss_clip": 0.01564216, + "auxiliary_loss_mlp": 0.01415881, + "balance_loss_clip": 1.17693233, + "balance_loss_mlp": 1.06054187, + "epoch": 0.06463249661806704, + "flos": 19466025811200.0, + "grad_norm": 3.2700508461595645, + "language_loss": 0.79224181, + "learning_rate": 3.987432205347958e-06, + "loss": 0.82204282, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 2.782768487930298 + }, + { + "auxiliary_loss_clip": 0.0157249, + "auxiliary_loss_mlp": 0.01438227, + "balance_loss_clip": 1.18409348, + "balance_loss_mlp": 1.08231592, + "epoch": 0.064692619870735, + "flos": 24500318692320.0, + "grad_norm": 3.035154342119932, + "language_loss": 0.88103372, + "learning_rate": 3.987388575251055e-06, + "loss": 0.91114086, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.8873796463012695 + }, + { + "auxiliary_loss_clip": 0.01564219, + "auxiliary_loss_mlp": 0.01405709, + "balance_loss_clip": 1.17692113, + "balance_loss_mlp": 1.05132329, + "epoch": 0.06475274312340297, + "flos": 17020545357600.0, + "grad_norm": 2.670509120688908, + "language_loss": 0.81022561, + "learning_rate": 3.98734486979218e-06, + "loss": 0.83992487, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 2.766221761703491 + }, + { + "auxiliary_loss_clip": 0.01569498, + "auxiliary_loss_mlp": 0.01416231, + "balance_loss_clip": 1.18085861, + "balance_loss_mlp": 1.06489754, + "epoch": 0.06481286637607095, + "flos": 24574734470880.0, + "grad_norm": 2.5331079391891356, + "language_loss": 0.91867697, + "learning_rate": 3.987301088972986e-06, + "loss": 0.94853431, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.851654529571533 + }, + { + "auxiliary_loss_clip": 0.01559939, + "auxiliary_loss_mlp": 0.0141658, + "balance_loss_clip": 1.1732204, + "balance_loss_mlp": 1.05380225, + "epoch": 0.06487298962873891, + "flos": 21107762611200.0, + "grad_norm": 2.0464906094422597, + "language_loss": 0.78730166, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81706685, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 2.8391621112823486 + }, + { + "auxiliary_loss_clip": 0.01566539, + "auxiliary_loss_mlp": 0.01417194, + "balance_loss_clip": 1.17761672, + "balance_loss_mlp": 1.06109238, + "epoch": 0.06493311288140688, + "flos": 24610501090080.0, + "grad_norm": 2.608213589023793, + "language_loss": 0.69628203, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7261194, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 4.397906303405762 + }, + { + "auxiliary_loss_clip": 0.01560848, + "auxiliary_loss_mlp": 0.01409635, + "balance_loss_clip": 1.17309153, + "balance_loss_mlp": 1.05162585, + "epoch": 0.06499323613407486, + "flos": 25340359959360.0, + "grad_norm": 1.9185314012490389, + "language_loss": 0.71767735, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74738222, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 2.8311150074005127 + }, + { + "auxiliary_loss_clip": 0.01562493, + "auxiliary_loss_mlp": 0.01414306, + "balance_loss_clip": 1.17338383, + "balance_loss_mlp": 1.0572505, + "epoch": 0.06505335938674282, + "flos": 20377789957440.0, + "grad_norm": 2.4191428339455388, + "language_loss": 0.84712195, + "learning_rate": 3.987125212126294e-06, + "loss": 0.87689, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 2.787815809249878 + }, + { + "auxiliary_loss_clip": 0.01559575, + "auxiliary_loss_mlp": 0.01418402, + "balance_loss_clip": 1.17180073, + "balance_loss_mlp": 1.06229973, + "epoch": 0.06511348263941079, + "flos": 25340284103040.0, + "grad_norm": 3.595546707970451, + "language_loss": 0.83150154, + "learning_rate": 3.987081054530478e-06, + "loss": 0.86128128, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 5.785488128662109 + }, + { + "auxiliary_loss_clip": 0.01563586, + "auxiliary_loss_mlp": 0.01416119, + "balance_loss_clip": 1.17480087, + "balance_loss_mlp": 1.06039882, + "epoch": 0.06517360589207877, + "flos": 20334513562560.0, + "grad_norm": 5.216478755138177, + "language_loss": 0.79338956, + "learning_rate": 3.987036821584348e-06, + "loss": 0.82318664, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 2.76487398147583 + }, + { + "auxiliary_loss_clip": 0.01571201, + "auxiliary_loss_mlp": 0.01421492, + "balance_loss_clip": 1.18218672, + "balance_loss_mlp": 1.06977677, + "epoch": 0.06523372914474673, + "flos": 31684097751840.0, + "grad_norm": 3.533467063796219, + "language_loss": 0.66199923, + "learning_rate": 3.986992513289584e-06, + "loss": 0.69192612, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 4.402602434158325 + }, + { + "auxiliary_loss_clip": 0.0156679, + "auxiliary_loss_mlp": 0.01434157, + "balance_loss_clip": 1.1771934, + "balance_loss_mlp": 1.08701944, + "epoch": 0.0652938523974147, + "flos": 20780325527040.0, + "grad_norm": 2.241216827375069, + "language_loss": 0.77115238, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.80116189, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 2.8302600383758545 + }, + { + "auxiliary_loss_clip": 0.01567435, + "auxiliary_loss_mlp": 0.01420501, + "balance_loss_clip": 1.178859, + "balance_loss_mlp": 1.07584262, + "epoch": 0.06535397565008266, + "flos": 16692880704480.0, + "grad_norm": 2.151482330942934, + "language_loss": 0.85268074, + "learning_rate": 3.986903670660872e-06, + "loss": 0.88256001, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 2.9278628826141357 + }, + { + "auxiliary_loss_clip": 0.01564325, + "auxiliary_loss_mlp": 0.01428959, + "balance_loss_clip": 1.17550111, + "balance_loss_mlp": 1.08086812, + "epoch": 0.06541409890275064, + "flos": 26870548947840.0, + "grad_norm": 1.8243648415076317, + "language_loss": 0.7812596, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.81119239, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.8282928466796875 + }, + { + "auxiliary_loss_clip": 0.01568034, + "auxiliary_loss_mlp": 0.01424091, + "balance_loss_clip": 1.17924201, + "balance_loss_mlp": 1.07523727, + "epoch": 0.06547422215541861, + "flos": 20523701046240.0, + "grad_norm": 2.338633461404379, + "language_loss": 0.71065015, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.74057138, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.689005136489868 + }, + { + "auxiliary_loss_clip": 0.01572619, + "auxiliary_loss_mlp": 0.0143055, + "balance_loss_clip": 1.18426156, + "balance_loss_mlp": 1.09180522, + "epoch": 0.06553434540808657, + "flos": 22018730266080.0, + "grad_norm": 1.933251438904254, + "language_loss": 0.85687214, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.88690382, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.8101155757904053 + }, + { + "auxiliary_loss_clip": 0.01572143, + "auxiliary_loss_mlp": 0.01428535, + "balance_loss_clip": 1.18447089, + "balance_loss_mlp": 1.07071686, + "epoch": 0.06559446866075455, + "flos": 24610994156160.0, + "grad_norm": 1.753884958701023, + "language_loss": 0.72018802, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.75019479, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.8018383979797363 + }, + { + "auxiliary_loss_clip": 0.01566161, + "auxiliary_loss_mlp": 0.0142315, + "balance_loss_clip": 1.1779058, + "balance_loss_mlp": 1.07544005, + "epoch": 0.06565459191342252, + "flos": 24276692075040.0, + "grad_norm": 3.635667270741816, + "language_loss": 0.82819855, + "learning_rate": 3.986680245605936e-06, + "loss": 0.85809165, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 2.766742706298828 + }, + { + "auxiliary_loss_clip": 0.01556707, + "auxiliary_loss_mlp": 0.01432859, + "balance_loss_clip": 1.16918695, + "balance_loss_mlp": 1.0876286, + "epoch": 0.06571471516609048, + "flos": 24789258329760.0, + "grad_norm": 1.9030568016604934, + "language_loss": 0.71231669, + "learning_rate": 3.986635334582814e-06, + "loss": 0.7422123, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 2.8745081424713135 + }, + { + "auxiliary_loss_clip": 0.01564516, + "auxiliary_loss_mlp": 0.01415091, + "balance_loss_clip": 1.17484498, + "balance_loss_mlp": 1.07443881, + "epoch": 0.06577483841875846, + "flos": 26216509199040.0, + "grad_norm": 1.9832261949558825, + "language_loss": 0.8826015, + "learning_rate": 3.986590348226282e-06, + "loss": 0.91239762, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.7905585765838623 + }, + { + "auxiliary_loss_clip": 0.01565251, + "auxiliary_loss_mlp": 0.01417942, + "balance_loss_clip": 1.17631757, + "balance_loss_mlp": 1.06946909, + "epoch": 0.06583496167142643, + "flos": 25083052771680.0, + "grad_norm": 2.050752764962017, + "language_loss": 0.81671774, + "learning_rate": 3.986545286538044e-06, + "loss": 0.84654963, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 2.8960466384887695 + }, + { + "auxiliary_loss_clip": 0.01565165, + "auxiliary_loss_mlp": 0.01408901, + "balance_loss_clip": 1.17808974, + "balance_loss_mlp": 1.04898381, + "epoch": 0.06589508492409439, + "flos": 25632182136960.0, + "grad_norm": 2.504547024355932, + "language_loss": 0.70136803, + "learning_rate": 3.986500149519811e-06, + "loss": 0.73110867, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 2.830862045288086 + }, + { + "auxiliary_loss_clip": 0.01567352, + "auxiliary_loss_mlp": 0.01408426, + "balance_loss_clip": 1.17881489, + "balance_loss_mlp": 1.05175233, + "epoch": 0.06595520817676236, + "flos": 23623410889440.0, + "grad_norm": 1.9442050464715808, + "language_loss": 0.77879363, + "learning_rate": 3.986454937173292e-06, + "loss": 0.80855143, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.8626084327697754 + }, + { + "auxiliary_loss_clip": 0.01559023, + "auxiliary_loss_mlp": 0.0140902, + "balance_loss_clip": 1.17089486, + "balance_loss_mlp": 1.06378937, + "epoch": 0.06601533142943034, + "flos": 33804492667200.0, + "grad_norm": 2.6206693633568348, + "language_loss": 0.78405046, + "learning_rate": 3.986409649500203e-06, + "loss": 0.8137309, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 2.932888984680176 + }, + { + "auxiliary_loss_clip": 0.01564678, + "auxiliary_loss_mlp": 0.01425411, + "balance_loss_clip": 1.17674112, + "balance_loss_mlp": 1.07522166, + "epoch": 0.0660754546820983, + "flos": 20260856347200.0, + "grad_norm": 1.908342699000693, + "language_loss": 0.81856692, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84846783, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.776193857192993 + }, + { + "auxiliary_loss_clip": 0.01559867, + "auxiliary_loss_mlp": 0.01410834, + "balance_loss_clip": 1.17123103, + "balance_loss_mlp": 1.06198001, + "epoch": 0.06613557793476627, + "flos": 19356184766880.0, + "grad_norm": 2.2998414399346485, + "language_loss": 0.82973063, + "learning_rate": 3.986318848181186e-06, + "loss": 0.8594377, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.7959389686584473 + }, + { + "auxiliary_loss_clip": 0.01568389, + "auxiliary_loss_mlp": 0.01435347, + "balance_loss_clip": 1.17952657, + "balance_loss_mlp": 1.08573008, + "epoch": 0.06619570118743424, + "flos": 13774317575040.0, + "grad_norm": 2.4706809609401104, + "language_loss": 0.73225009, + "learning_rate": 3.986273334538702e-06, + "loss": 0.76228744, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.7647793292999268 + }, + { + "auxiliary_loss_clip": 0.01555232, + "auxiliary_loss_mlp": 0.0141788, + "balance_loss_clip": 1.16757417, + "balance_loss_mlp": 1.068645, + "epoch": 0.06625582444010221, + "flos": 17859790133280.0, + "grad_norm": 8.822067881187836, + "language_loss": 0.86679387, + "learning_rate": 3.986227745576533e-06, + "loss": 0.89652497, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 2.749812602996826 + }, + { + "auxiliary_loss_clip": 0.01560679, + "auxiliary_loss_mlp": 0.01418387, + "balance_loss_clip": 1.17204642, + "balance_loss_mlp": 1.06514633, + "epoch": 0.06631594769277017, + "flos": 11840341387680.0, + "grad_norm": 2.984796353500595, + "language_loss": 0.81949145, + "learning_rate": 3.98618208129641e-06, + "loss": 0.84928203, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.7418813705444336 + }, + { + "auxiliary_loss_clip": 0.01563904, + "auxiliary_loss_mlp": 0.01418138, + "balance_loss_clip": 1.17496085, + "balance_loss_mlp": 1.0723362, + "epoch": 0.06637607094543815, + "flos": 19795473087840.0, + "grad_norm": 1.789510687958657, + "language_loss": 0.82059544, + "learning_rate": 3.986136341700063e-06, + "loss": 0.85041589, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.797542095184326 + }, + { + "auxiliary_loss_clip": 0.01550305, + "auxiliary_loss_mlp": 0.01420661, + "balance_loss_clip": 1.16339588, + "balance_loss_mlp": 1.07390523, + "epoch": 0.06643619419810612, + "flos": 25488129528000.0, + "grad_norm": 2.271504998182669, + "language_loss": 0.80790007, + "learning_rate": 3.986090526789227e-06, + "loss": 0.83760977, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.8334312438964844 + }, + { + "auxiliary_loss_clip": 0.01551508, + "auxiliary_loss_mlp": 0.0141617, + "balance_loss_clip": 1.16445374, + "balance_loss_mlp": 1.06769717, + "epoch": 0.06649631745077408, + "flos": 16948443196800.0, + "grad_norm": 2.129653248722037, + "language_loss": 0.96707678, + "learning_rate": 3.986044636565639e-06, + "loss": 0.99675357, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.8259873390197754 + }, + { + "auxiliary_loss_clip": 0.01552003, + "auxiliary_loss_mlp": 0.01413435, + "balance_loss_clip": 1.16518927, + "balance_loss_mlp": 1.06801414, + "epoch": 0.06655644070344206, + "flos": 17860548696480.0, + "grad_norm": 1.914418938550486, + "language_loss": 0.83102161, + "learning_rate": 3.985998671031039e-06, + "loss": 0.86067605, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.9121103286743164 + }, + { + "auxiliary_loss_clip": 0.01731606, + "auxiliary_loss_mlp": 0.01288406, + "balance_loss_clip": 1.34141159, + "balance_loss_mlp": 1.00840759, + "epoch": 0.06661656395611003, + "flos": 61425609259680.0, + "grad_norm": 0.8273956307512033, + "language_loss": 0.56648564, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.59668577, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.3054513931274414 + }, + { + "auxiliary_loss_clip": 0.01555163, + "auxiliary_loss_mlp": 0.01417593, + "balance_loss_clip": 1.16725051, + "balance_loss_mlp": 1.07274401, + "epoch": 0.066676687208778, + "flos": 20664681474240.0, + "grad_norm": 3.447584914366571, + "language_loss": 0.72875488, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.75848246, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.8574957847595215 + }, + { + "auxiliary_loss_clip": 0.01556084, + "auxiliary_loss_mlp": 0.01416263, + "balance_loss_clip": 1.16736317, + "balance_loss_mlp": 1.07885289, + "epoch": 0.06673681046144596, + "flos": 20925402196320.0, + "grad_norm": 1.966419733058551, + "language_loss": 0.78025573, + "learning_rate": 3.985860322578614e-06, + "loss": 0.8099792, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.777780532836914 + }, + { + "auxiliary_loss_clip": 0.01555829, + "auxiliary_loss_mlp": 0.01424279, + "balance_loss_clip": 1.16790926, + "balance_loss_mlp": 1.08705997, + "epoch": 0.06679693371411394, + "flos": 31068479593440.0, + "grad_norm": 3.231161055481156, + "language_loss": 0.71863437, + "learning_rate": 3.985814055817427e-06, + "loss": 0.7484355, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 2.928209066390991 + }, + { + "auxiliary_loss_clip": 0.0156364, + "auxiliary_loss_mlp": 0.0142975, + "balance_loss_clip": 1.17434621, + "balance_loss_mlp": 1.09043264, + "epoch": 0.0668570569667819, + "flos": 21728311430400.0, + "grad_norm": 1.9754414765840773, + "language_loss": 0.78289801, + "learning_rate": 3.985767713753971e-06, + "loss": 0.81283194, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.8712189197540283 + }, + { + "auxiliary_loss_clip": 0.01555943, + "auxiliary_loss_mlp": 0.01427678, + "balance_loss_clip": 1.16651523, + "balance_loss_mlp": 1.0908401, + "epoch": 0.06691718021944987, + "flos": 22749461483040.0, + "grad_norm": 2.965500397428051, + "language_loss": 0.7940644, + "learning_rate": 3.985721296390005e-06, + "loss": 0.82390064, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 2.814190626144409 + }, + { + "auxiliary_loss_clip": 0.0155726, + "auxiliary_loss_mlp": 0.01423197, + "balance_loss_clip": 1.16578031, + "balance_loss_mlp": 1.0911274, + "epoch": 0.06697730347211785, + "flos": 16547538538080.0, + "grad_norm": 2.234634606002276, + "language_loss": 0.83129835, + "learning_rate": 3.985674803727289e-06, + "loss": 0.86110288, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.9237492084503174 + }, + { + "auxiliary_loss_clip": 0.0171608, + "auxiliary_loss_mlp": 0.01318863, + "balance_loss_clip": 1.32557571, + "balance_loss_mlp": 1.0556488, + "epoch": 0.06703742672478581, + "flos": 59788765192320.0, + "grad_norm": 0.8442876857230525, + "language_loss": 0.58095652, + "learning_rate": 3.985628235767584e-06, + "loss": 0.61130595, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 3.20740008354187 + }, + { + "auxiliary_loss_clip": 0.01558226, + "auxiliary_loss_mlp": 0.01426047, + "balance_loss_clip": 1.16881585, + "balance_loss_mlp": 1.08596659, + "epoch": 0.06709754997745378, + "flos": 16802076970080.0, + "grad_norm": 2.675550596520659, + "language_loss": 0.91189861, + "learning_rate": 3.985581592512658e-06, + "loss": 0.94174135, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.7810423374176025 + }, + { + "auxiliary_loss_clip": 0.01563868, + "auxiliary_loss_mlp": 0.01410662, + "balance_loss_clip": 1.17453885, + "balance_loss_mlp": 1.06505084, + "epoch": 0.06715767323012176, + "flos": 22125916339200.0, + "grad_norm": 2.187740946503104, + "language_loss": 0.87541759, + "learning_rate": 3.985534873964279e-06, + "loss": 0.90516287, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 2.7745025157928467 + }, + { + "auxiliary_loss_clip": 0.01710601, + "auxiliary_loss_mlp": 0.01288109, + "balance_loss_clip": 1.31955183, + "balance_loss_mlp": 1.02336884, + "epoch": 0.06721779648278972, + "flos": 66623601900960.0, + "grad_norm": 0.855239784579456, + "language_loss": 0.59712756, + "learning_rate": 3.985488080124218e-06, + "loss": 0.62711465, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 4.727833032608032 + }, + { + "auxiliary_loss_clip": 0.01555841, + "auxiliary_loss_mlp": 0.01419222, + "balance_loss_clip": 1.16437173, + "balance_loss_mlp": 1.07513642, + "epoch": 0.06727791973545769, + "flos": 22384664796960.0, + "grad_norm": 3.97282557718947, + "language_loss": 0.83855766, + "learning_rate": 3.985441210994251e-06, + "loss": 0.86830825, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.824948787689209 + }, + { + "auxiliary_loss_clip": 0.01563576, + "auxiliary_loss_mlp": 0.01420799, + "balance_loss_clip": 1.17265463, + "balance_loss_mlp": 1.08129144, + "epoch": 0.06733804298812565, + "flos": 24282533011680.0, + "grad_norm": 1.8109077945404468, + "language_loss": 0.84980768, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87965143, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 2.8205976486206055 + }, + { + "auxiliary_loss_clip": 0.01558965, + "auxiliary_loss_mlp": 0.01417914, + "balance_loss_clip": 1.16950178, + "balance_loss_mlp": 1.0805043, + "epoch": 0.06739816624079363, + "flos": 15919707512160.0, + "grad_norm": 2.2247136522982665, + "language_loss": 0.79153347, + "learning_rate": 3.985347246871708e-06, + "loss": 0.82130229, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 4.235708236694336 + }, + { + "auxiliary_loss_clip": 0.01693194, + "auxiliary_loss_mlp": 0.01279503, + "balance_loss_clip": 1.30169296, + "balance_loss_mlp": 1.01476288, + "epoch": 0.0674582894934616, + "flos": 71406428166720.0, + "grad_norm": 0.7536152701690441, + "language_loss": 0.58314663, + "learning_rate": 3.985300151882694e-06, + "loss": 0.61287361, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 4.94326376914978 + }, + { + "auxiliary_loss_clip": 0.01560071, + "auxiliary_loss_mlp": 0.01416635, + "balance_loss_clip": 1.17186594, + "balance_loss_mlp": 1.0725491, + "epoch": 0.06751841274612956, + "flos": 25267309594560.0, + "grad_norm": 2.1191093929003015, + "language_loss": 0.71442544, + "learning_rate": 3.985252981610901e-06, + "loss": 0.74419248, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.813692331314087 + }, + { + "auxiliary_loss_clip": 0.0155239, + "auxiliary_loss_mlp": 0.0142615, + "balance_loss_clip": 1.16236246, + "balance_loss_mlp": 1.08511567, + "epoch": 0.06757853599879754, + "flos": 23804861028480.0, + "grad_norm": 2.0570448789218396, + "language_loss": 0.79286528, + "learning_rate": 3.985205736058114e-06, + "loss": 0.82265067, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 4.124637603759766 + }, + { + "auxiliary_loss_clip": 0.01551797, + "auxiliary_loss_mlp": 0.01413624, + "balance_loss_clip": 1.16246128, + "balance_loss_mlp": 1.07163668, + "epoch": 0.0676386592514655, + "flos": 21035925947520.0, + "grad_norm": 2.037353151175138, + "language_loss": 0.71162164, + "learning_rate": 3.985158415226128e-06, + "loss": 0.74127585, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 2.6730682849884033 + }, + { + "auxiliary_loss_clip": 0.01561169, + "auxiliary_loss_mlp": 0.01416238, + "balance_loss_clip": 1.17113054, + "balance_loss_mlp": 1.07520378, + "epoch": 0.06769878250413347, + "flos": 25558676634240.0, + "grad_norm": 2.8648263799443283, + "language_loss": 0.81616759, + "learning_rate": 3.985111019116736e-06, + "loss": 0.84594166, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 2.852055072784424 + }, + { + "auxiliary_loss_clip": 0.01676304, + "auxiliary_loss_mlp": 0.01272507, + "balance_loss_clip": 1.28690875, + "balance_loss_mlp": 1.00318909, + "epoch": 0.06775890575680145, + "flos": 70662308309280.0, + "grad_norm": 0.7873138880632274, + "language_loss": 0.59706324, + "learning_rate": 3.985063547731735e-06, + "loss": 0.62655133, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 3.2845773696899414 + }, + { + "auxiliary_loss_clip": 0.01557412, + "auxiliary_loss_mlp": 0.01405957, + "balance_loss_clip": 1.16823184, + "balance_loss_mlp": 1.06778455, + "epoch": 0.06781902900946941, + "flos": 24237360208800.0, + "grad_norm": 2.4839946848108148, + "language_loss": 0.81197059, + "learning_rate": 3.985016001072925e-06, + "loss": 0.84160435, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.8039448261260986 + }, + { + "auxiliary_loss_clip": 0.01566645, + "auxiliary_loss_mlp": 0.01426343, + "balance_loss_clip": 1.17681813, + "balance_loss_mlp": 1.07996869, + "epoch": 0.06787915226213738, + "flos": 22419634924800.0, + "grad_norm": 4.145539123367221, + "language_loss": 0.75377488, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78370476, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.7722599506378174 + }, + { + "auxiliary_loss_clip": 0.01562139, + "auxiliary_loss_mlp": 0.01413277, + "balance_loss_clip": 1.17340708, + "balance_loss_mlp": 1.0743413, + "epoch": 0.06793927551480534, + "flos": 37709994284640.0, + "grad_norm": 2.0726483716393025, + "language_loss": 0.71990085, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74965501, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 2.901278018951416 + }, + { + "auxiliary_loss_clip": 0.01553792, + "auxiliary_loss_mlp": 0.01431739, + "balance_loss_clip": 1.16506052, + "balance_loss_mlp": 1.09699965, + "epoch": 0.06799939876747332, + "flos": 20633504162400.0, + "grad_norm": 3.3700917557758063, + "language_loss": 0.80918813, + "learning_rate": 3.984872909471688e-06, + "loss": 0.8390435, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 2.8895576000213623 + }, + { + "auxiliary_loss_clip": 0.0154867, + "auxiliary_loss_mlp": 0.01432526, + "balance_loss_clip": 1.15913653, + "balance_loss_mlp": 1.09339905, + "epoch": 0.06805952202014129, + "flos": 14866431943680.0, + "grad_norm": 2.802320243239111, + "language_loss": 0.80360675, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83341873, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 2.717046022415161 + }, + { + "auxiliary_loss_clip": 0.01547701, + "auxiliary_loss_mlp": 0.01424173, + "balance_loss_clip": 1.16012466, + "balance_loss_mlp": 1.09115028, + "epoch": 0.06811964527280925, + "flos": 48913212247200.0, + "grad_norm": 1.568309543591805, + "language_loss": 0.63775021, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.66746897, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 3.0311574935913086 + }, + { + "auxiliary_loss_clip": 0.01548926, + "auxiliary_loss_mlp": 0.01440172, + "balance_loss_clip": 1.16013789, + "balance_loss_mlp": 1.10829318, + "epoch": 0.06817976852547723, + "flos": 15379074054720.0, + "grad_norm": 1.8299562441521853, + "language_loss": 0.74838042, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77827144, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 2.7247698307037354 + }, + { + "auxiliary_loss_clip": 0.01557196, + "auxiliary_loss_mlp": 0.01404755, + "balance_loss_clip": 1.16978133, + "balance_loss_mlp": 1.06333995, + "epoch": 0.0682398917781452, + "flos": 20157690659040.0, + "grad_norm": 1.8607552354271564, + "language_loss": 0.8726759, + "learning_rate": 3.984681066946423e-06, + "loss": 0.90229541, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 2.7912356853485107 + }, + { + "auxiliary_loss_clip": 0.01553142, + "auxiliary_loss_mlp": 0.01407166, + "balance_loss_clip": 1.16497862, + "balance_loss_mlp": 1.06460571, + "epoch": 0.06830001503081316, + "flos": 23442871026240.0, + "grad_norm": 2.6095542496767314, + "language_loss": 0.7874704, + "learning_rate": 3.984632918162291e-06, + "loss": 0.81707346, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.8089230060577393 + }, + { + "auxiliary_loss_clip": 0.01560352, + "auxiliary_loss_mlp": 0.01438961, + "balance_loss_clip": 1.17211521, + "balance_loss_mlp": 1.09773612, + "epoch": 0.06836013828348114, + "flos": 34352673828480.0, + "grad_norm": 2.672917593891496, + "language_loss": 0.84384227, + "learning_rate": 3.984584694120679e-06, + "loss": 0.87383538, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.8876407146453857 + }, + { + "auxiliary_loss_clip": 0.01547193, + "auxiliary_loss_mlp": 0.01407534, + "balance_loss_clip": 1.1586132, + "balance_loss_mlp": 1.0630672, + "epoch": 0.06842026153614911, + "flos": 23151086776800.0, + "grad_norm": 3.1865030723807855, + "language_loss": 0.78902435, + "learning_rate": 3.984536394823418e-06, + "loss": 0.81857163, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 2.85945987701416 + }, + { + "auxiliary_loss_clip": 0.01548334, + "auxiliary_loss_mlp": 0.0140953, + "balance_loss_clip": 1.16045702, + "balance_loss_mlp": 1.068115, + "epoch": 0.06848038478881707, + "flos": 24611335509600.0, + "grad_norm": 2.697707153308988, + "language_loss": 0.85886645, + "learning_rate": 3.984488020272336e-06, + "loss": 0.88844502, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.7942333221435547 + }, + { + "auxiliary_loss_clip": 0.01548911, + "auxiliary_loss_mlp": 0.01411663, + "balance_loss_clip": 1.1631037, + "balance_loss_mlp": 1.06567025, + "epoch": 0.06854050804148504, + "flos": 40884233690880.0, + "grad_norm": 2.944940824304755, + "language_loss": 0.75190622, + "learning_rate": 3.984439570469271e-06, + "loss": 0.7815119, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.9535324573516846 + }, + { + "auxiliary_loss_clip": 0.01550644, + "auxiliary_loss_mlp": 0.01412908, + "balance_loss_clip": 1.16439939, + "balance_loss_mlp": 1.06767833, + "epoch": 0.06860063129415302, + "flos": 31689066340800.0, + "grad_norm": 2.3767131064186593, + "language_loss": 0.67851108, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70814669, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 2.8226773738861084 + }, + { + "auxiliary_loss_clip": 0.01552642, + "auxiliary_loss_mlp": 0.01408464, + "balance_loss_clip": 1.16554546, + "balance_loss_mlp": 1.06723952, + "epoch": 0.06866075454682098, + "flos": 26544439349280.0, + "grad_norm": 2.1570009547528985, + "language_loss": 0.79165471, + "learning_rate": 3.984342445114538e-06, + "loss": 0.82126582, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.8430466651916504 + }, + { + "auxiliary_loss_clip": 0.01558124, + "auxiliary_loss_mlp": 0.0141644, + "balance_loss_clip": 1.17199492, + "balance_loss_mlp": 1.06033826, + "epoch": 0.06872087779948895, + "flos": 29792411827200.0, + "grad_norm": 1.819587121293572, + "language_loss": 0.69006151, + "learning_rate": 3.984293769566553e-06, + "loss": 0.71980715, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.82395339012146 + }, + { + "auxiliary_loss_clip": 0.01549498, + "auxiliary_loss_mlp": 0.01407784, + "balance_loss_clip": 1.16287494, + "balance_loss_mlp": 1.06770396, + "epoch": 0.06878100105215693, + "flos": 26943447600000.0, + "grad_norm": 1.7242084288996773, + "language_loss": 0.74371719, + "learning_rate": 3.98424501877395e-06, + "loss": 0.77329004, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.823671817779541 + }, + { + "auxiliary_loss_clip": 0.01545504, + "auxiliary_loss_mlp": 0.01427134, + "balance_loss_clip": 1.1599052, + "balance_loss_mlp": 1.08362079, + "epoch": 0.06884112430482489, + "flos": 10672294114080.0, + "grad_norm": 2.546371819795081, + "language_loss": 0.92327976, + "learning_rate": 3.984196192738577e-06, + "loss": 0.95300609, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.7538955211639404 + }, + { + "auxiliary_loss_clip": 0.01550366, + "auxiliary_loss_mlp": 0.01410725, + "balance_loss_clip": 1.1637007, + "balance_loss_mlp": 1.0635879, + "epoch": 0.06890124755749286, + "flos": 20195732967840.0, + "grad_norm": 2.6876682922941484, + "language_loss": 0.81995165, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84956253, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.7251944541931152 + }, + { + "auxiliary_loss_clip": 0.01554549, + "auxiliary_loss_mlp": 0.01412095, + "balance_loss_clip": 1.16801023, + "balance_loss_mlp": 1.07315958, + "epoch": 0.06896137081016084, + "flos": 20451333388320.0, + "grad_norm": 2.1808726863750536, + "language_loss": 0.8550601, + "learning_rate": 3.98409831494693e-06, + "loss": 0.88472652, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.775402069091797 + }, + { + "auxiliary_loss_clip": 0.01542668, + "auxiliary_loss_mlp": 0.01397931, + "balance_loss_clip": 1.15803015, + "balance_loss_mlp": 1.05785108, + "epoch": 0.0690214940628288, + "flos": 18370649620800.0, + "grad_norm": 2.1551159890784968, + "language_loss": 0.85990745, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88931346, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.714855909347534 + }, + { + "auxiliary_loss_clip": 0.0155716, + "auxiliary_loss_mlp": 0.01410694, + "balance_loss_clip": 1.17063546, + "balance_loss_mlp": 1.07175851, + "epoch": 0.06908161731549677, + "flos": 20560112444160.0, + "grad_norm": 3.901368674401871, + "language_loss": 0.69468379, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.72436231, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.806434392929077 + }, + { + "auxiliary_loss_clip": 0.01539407, + "auxiliary_loss_mlp": 0.0142181, + "balance_loss_clip": 1.15288234, + "balance_loss_mlp": 1.08897746, + "epoch": 0.06914174056816474, + "flos": 27566158324320.0, + "grad_norm": 3.1494214124040587, + "language_loss": 0.83570391, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86531609, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 2.9906060695648193 + }, + { + "auxiliary_loss_clip": 0.01544488, + "auxiliary_loss_mlp": 0.01427077, + "balance_loss_clip": 1.16072547, + "balance_loss_mlp": 1.0911932, + "epoch": 0.06920186382083271, + "flos": 15305796120960.0, + "grad_norm": 4.529426134459248, + "language_loss": 0.81659138, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84630704, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 2.7741503715515137 + }, + { + "auxiliary_loss_clip": 0.01544696, + "auxiliary_loss_mlp": 0.01407786, + "balance_loss_clip": 1.15991628, + "balance_loss_mlp": 1.07476282, + "epoch": 0.06926198707350067, + "flos": 25193538594720.0, + "grad_norm": 2.5938115820148724, + "language_loss": 0.85746515, + "learning_rate": 3.983852303849291e-06, + "loss": 0.88698995, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.8771867752075195 + }, + { + "auxiliary_loss_clip": 0.01546014, + "auxiliary_loss_mlp": 0.01421237, + "balance_loss_clip": 1.16104412, + "balance_loss_mlp": 1.09031248, + "epoch": 0.06932211032616864, + "flos": 13257541294560.0, + "grad_norm": 2.2823403229873085, + "language_loss": 0.9133364, + "learning_rate": 3.983802875938651e-06, + "loss": 0.94300896, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 2.7304961681365967 + }, + { + "auxiliary_loss_clip": 0.01549688, + "auxiliary_loss_mlp": 0.01424557, + "balance_loss_clip": 1.165622, + "balance_loss_mlp": 1.09820926, + "epoch": 0.06938223357883662, + "flos": 24829728040800.0, + "grad_norm": 2.467390061493246, + "language_loss": 0.82047325, + "learning_rate": 3.983753372802008e-06, + "loss": 0.85021567, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.8212239742279053 + }, + { + "auxiliary_loss_clip": 0.01557382, + "auxiliary_loss_mlp": 0.0142693, + "balance_loss_clip": 1.17339063, + "balance_loss_mlp": 1.09810352, + "epoch": 0.06944235683150458, + "flos": 27270239905440.0, + "grad_norm": 4.400849174286001, + "language_loss": 0.74912816, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77897131, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 2.798280715942383 + }, + { + "auxiliary_loss_clip": 0.01539706, + "auxiliary_loss_mlp": 0.01398852, + "balance_loss_clip": 1.15602553, + "balance_loss_mlp": 1.06449413, + "epoch": 0.06950248008417255, + "flos": 25810218741600.0, + "grad_norm": 1.6432174245284816, + "language_loss": 0.70835686, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73774242, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 4.2834718227386475 + }, + { + "auxiliary_loss_clip": 0.01551403, + "auxiliary_loss_mlp": 0.01408784, + "balance_loss_clip": 1.16708302, + "balance_loss_mlp": 1.07690501, + "epoch": 0.06956260333684053, + "flos": 22273647979680.0, + "grad_norm": 2.315381441194363, + "language_loss": 0.7504437, + "learning_rate": 3.98360441205484e-06, + "loss": 0.78004551, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 2.7959916591644287 + }, + { + "auxiliary_loss_clip": 0.01541818, + "auxiliary_loss_mlp": 0.01402962, + "balance_loss_clip": 1.1590271, + "balance_loss_mlp": 1.06784058, + "epoch": 0.0696227265895085, + "flos": 29684087909280.0, + "grad_norm": 2.4140844983057725, + "language_loss": 0.71967649, + "learning_rate": 3.983554608032982e-06, + "loss": 0.74912429, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 2.8083951473236084 + }, + { + "auxiliary_loss_clip": 0.01546133, + "auxiliary_loss_mlp": 0.01407145, + "balance_loss_clip": 1.16194344, + "balance_loss_mlp": 1.07087958, + "epoch": 0.06968284984217646, + "flos": 25526095980480.0, + "grad_norm": 1.952928524666496, + "language_loss": 0.79798752, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82752031, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 4.293460369110107 + }, + { + "auxiliary_loss_clip": 0.01552632, + "auxiliary_loss_mlp": 0.01424429, + "balance_loss_clip": 1.16860843, + "balance_loss_mlp": 1.0864464, + "epoch": 0.06974297309484444, + "flos": 20699955027360.0, + "grad_norm": 3.1892741522468393, + "language_loss": 0.80952573, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83929628, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 4.181583642959595 + }, + { + "auxiliary_loss_clip": 0.01544704, + "auxiliary_loss_mlp": 0.0141096, + "balance_loss_clip": 1.16116118, + "balance_loss_mlp": 1.06820989, + "epoch": 0.0698030963475124, + "flos": 26507610741600.0, + "grad_norm": 2.216391670334817, + "language_loss": 0.76056826, + "learning_rate": 3.983404744675437e-06, + "loss": 0.79012489, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 4.328184604644775 + }, + { + "auxiliary_loss_clip": 0.01539607, + "auxiliary_loss_mlp": 0.01392752, + "balance_loss_clip": 1.15671003, + "balance_loss_mlp": 1.05572379, + "epoch": 0.06986321960018037, + "flos": 23042724930720.0, + "grad_norm": 2.5091023769813314, + "language_loss": 0.82812083, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85744441, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 2.807595729827881 + }, + { + "auxiliary_loss_clip": 0.01538745, + "auxiliary_loss_mlp": 0.01388101, + "balance_loss_clip": 1.1565299, + "balance_loss_mlp": 1.04916525, + "epoch": 0.06992334285284833, + "flos": 28587346305120.0, + "grad_norm": 2.0698514423996395, + "language_loss": 0.79522157, + "learning_rate": 3.983304459712716e-06, + "loss": 0.82449001, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.8221232891082764 + }, + { + "auxiliary_loss_clip": 0.0154151, + "auxiliary_loss_mlp": 0.01398935, + "balance_loss_clip": 1.15965915, + "balance_loss_mlp": 1.05809164, + "epoch": 0.06998346610551631, + "flos": 20597472046080.0, + "grad_norm": 7.7853455571184575, + "language_loss": 0.79304141, + "learning_rate": 3.983254204419749e-06, + "loss": 0.82244587, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.790013313293457 + }, + { + "auxiliary_loss_clip": 0.01544373, + "auxiliary_loss_mlp": 0.01400328, + "balance_loss_clip": 1.16276956, + "balance_loss_mlp": 1.06158292, + "epoch": 0.07004358935818428, + "flos": 22531031023680.0, + "grad_norm": 1.7110177426182904, + "language_loss": 0.73189175, + "learning_rate": 3.983203873921583e-06, + "loss": 0.76133871, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.794698715209961 + }, + { + "auxiliary_loss_clip": 0.01537044, + "auxiliary_loss_mlp": 0.01397657, + "balance_loss_clip": 1.1545496, + "balance_loss_mlp": 1.04746771, + "epoch": 0.07010371261085224, + "flos": 28952560200960.0, + "grad_norm": 2.103545059079891, + "language_loss": 0.81694412, + "learning_rate": 3.983153468220128e-06, + "loss": 0.84629107, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.8883607387542725 + }, + { + "auxiliary_loss_clip": 0.01533804, + "auxiliary_loss_mlp": 0.01393608, + "balance_loss_clip": 1.1537503, + "balance_loss_mlp": 1.04608858, + "epoch": 0.07016383586352022, + "flos": 23661415270080.0, + "grad_norm": 3.53834669854642, + "language_loss": 0.84985983, + "learning_rate": 3.983102987317295e-06, + "loss": 0.87913394, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.7844293117523193 + }, + { + "auxiliary_loss_clip": 0.01536581, + "auxiliary_loss_mlp": 0.01396219, + "balance_loss_clip": 1.15497708, + "balance_loss_mlp": 1.05423129, + "epoch": 0.07022395911618819, + "flos": 19794183530400.0, + "grad_norm": 2.5472020694207074, + "language_loss": 0.89537382, + "learning_rate": 3.983052431214997e-06, + "loss": 0.92470181, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.778076410293579 + }, + { + "auxiliary_loss_clip": 0.01546935, + "auxiliary_loss_mlp": 0.01407632, + "balance_loss_clip": 1.1655643, + "balance_loss_mlp": 1.06507194, + "epoch": 0.07028408236885615, + "flos": 21691293181920.0, + "grad_norm": 2.334329753788549, + "language_loss": 0.88970292, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91924858, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.774759292602539 + }, + { + "auxiliary_loss_clip": 0.01553982, + "auxiliary_loss_mlp": 0.01401364, + "balance_loss_clip": 1.17127633, + "balance_loss_mlp": 1.05670607, + "epoch": 0.07034420562152413, + "flos": 25632675203040.0, + "grad_norm": 2.4874099349460046, + "language_loss": 0.84051508, + "learning_rate": 3.982951093419681e-06, + "loss": 0.87006855, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.9072377681732178 + }, + { + "auxiliary_loss_clip": 0.01545384, + "auxiliary_loss_mlp": 0.01411524, + "balance_loss_clip": 1.16461718, + "balance_loss_mlp": 1.07430482, + "epoch": 0.0704043288741921, + "flos": 20812375186560.0, + "grad_norm": 3.7902469661624587, + "language_loss": 0.75739539, + "learning_rate": 3.982900311730506e-06, + "loss": 0.78696448, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 2.7784388065338135 + }, + { + "auxiliary_loss_clip": 0.01546306, + "auxiliary_loss_mlp": 0.01396154, + "balance_loss_clip": 1.16356778, + "balance_loss_mlp": 1.06065154, + "epoch": 0.07046445212686006, + "flos": 25595543170080.0, + "grad_norm": 1.911701842986215, + "language_loss": 0.89364988, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.92307448, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 2.882727861404419 + }, + { + "auxiliary_loss_clip": 0.01543933, + "auxiliary_loss_mlp": 0.01406977, + "balance_loss_clip": 1.16315866, + "balance_loss_mlp": 1.07681501, + "epoch": 0.07052457537952803, + "flos": 25559359341120.0, + "grad_norm": 1.8990129370154176, + "language_loss": 0.82126606, + "learning_rate": 3.982798522778748e-06, + "loss": 0.85077512, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 2.786450147628784 + }, + { + "auxiliary_loss_clip": 0.01549737, + "auxiliary_loss_mlp": 0.01393343, + "balance_loss_clip": 1.16943645, + "balance_loss_mlp": 1.04925776, + "epoch": 0.070584698632196, + "flos": 17970503525280.0, + "grad_norm": 3.022023409193968, + "language_loss": 0.82615399, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.8555848, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 2.7188758850097656 + }, + { + "auxiliary_loss_clip": 0.01538251, + "auxiliary_loss_mlp": 0.01406926, + "balance_loss_clip": 1.15806723, + "balance_loss_mlp": 1.07561994, + "epoch": 0.07064482188486397, + "flos": 25373054397600.0, + "grad_norm": 2.5247614139084518, + "language_loss": 0.85694718, + "learning_rate": 3.982696433075317e-06, + "loss": 0.88639891, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 2.8816118240356445 + }, + { + "auxiliary_loss_clip": 0.01547726, + "auxiliary_loss_mlp": 0.01405323, + "balance_loss_clip": 1.1662035, + "balance_loss_mlp": 1.07325363, + "epoch": 0.07070494513753194, + "flos": 24902171555040.0, + "grad_norm": 1.86594974026921, + "language_loss": 0.83613116, + "learning_rate": 3.982645275446563e-06, + "loss": 0.86566162, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.81636381149292 + }, + { + "auxiliary_loss_clip": 0.01548364, + "auxiliary_loss_mlp": 0.01416894, + "balance_loss_clip": 1.16735005, + "balance_loss_mlp": 1.08787608, + "epoch": 0.07076506839019991, + "flos": 22340212629120.0, + "grad_norm": 3.1416754113986327, + "language_loss": 0.7413615, + "learning_rate": 3.982594042635701e-06, + "loss": 0.77101403, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 2.8161003589630127 + }, + { + "auxiliary_loss_clip": 0.01542067, + "auxiliary_loss_mlp": 0.01422095, + "balance_loss_clip": 1.16083741, + "balance_loss_mlp": 1.09174204, + "epoch": 0.07082519164286788, + "flos": 18662775223680.0, + "grad_norm": 5.499336957054827, + "language_loss": 0.85797679, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88761842, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.8161509037017822 + }, + { + "auxiliary_loss_clip": 0.01699141, + "auxiliary_loss_mlp": 0.0132579, + "balance_loss_clip": 1.30741405, + "balance_loss_mlp": 1.08164978, + "epoch": 0.07088531489553584, + "flos": 63661003449120.0, + "grad_norm": 0.8741478432230897, + "language_loss": 0.631814, + "learning_rate": 3.982491351475427e-06, + "loss": 0.6620633, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.4171628952026367 + }, + { + "auxiliary_loss_clip": 0.01542623, + "auxiliary_loss_mlp": 0.01406253, + "balance_loss_clip": 1.16017127, + "balance_loss_mlp": 1.06808031, + "epoch": 0.07094543814820382, + "flos": 21574435428000.0, + "grad_norm": 3.001407819468357, + "language_loss": 0.83740985, + "learning_rate": 3.98243989312991e-06, + "loss": 0.86689866, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.774428606033325 + }, + { + "auxiliary_loss_clip": 0.01545369, + "auxiliary_loss_mlp": 0.0141299, + "balance_loss_clip": 1.16368043, + "balance_loss_mlp": 1.0799669, + "epoch": 0.07100556140087179, + "flos": 22092084056160.0, + "grad_norm": 2.9570255999718973, + "language_loss": 0.88574177, + "learning_rate": 3.982388359610074e-06, + "loss": 0.9153254, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 2.8820719718933105 + }, + { + "auxiliary_loss_clip": 0.01542745, + "auxiliary_loss_mlp": 0.01388039, + "balance_loss_clip": 1.16064978, + "balance_loss_mlp": 1.05253625, + "epoch": 0.07106568465353975, + "flos": 47925894477600.0, + "grad_norm": 2.9012253124097844, + "language_loss": 0.83493245, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.86424029, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.978659152984619 + }, + { + "auxiliary_loss_clip": 0.01551289, + "auxiliary_loss_mlp": 0.01388331, + "balance_loss_clip": 1.16932833, + "balance_loss_mlp": 1.04310071, + "epoch": 0.07112580790620772, + "flos": 23443060667040.0, + "grad_norm": 3.4694460557112894, + "language_loss": 0.7989127, + "learning_rate": 3.982285067055262e-06, + "loss": 0.82830888, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.7994682788848877 + }, + { + "auxiliary_loss_clip": 0.01539995, + "auxiliary_loss_mlp": 0.01397247, + "balance_loss_clip": 1.15797043, + "balance_loss_mlp": 1.04896486, + "epoch": 0.0711859311588757, + "flos": 31871616396480.0, + "grad_norm": 2.457594581603235, + "language_loss": 0.79443526, + "learning_rate": 3.982233308024204e-06, + "loss": 0.82380766, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.8504369258880615 + }, + { + "auxiliary_loss_clip": 0.01545038, + "auxiliary_loss_mlp": 0.01405583, + "balance_loss_clip": 1.16335607, + "balance_loss_mlp": 1.05787313, + "epoch": 0.07124605441154366, + "flos": 19612392037920.0, + "grad_norm": 3.4352329646059876, + "language_loss": 0.76956081, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79906702, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.8169491291046143 + }, + { + "auxiliary_loss_clip": 0.01541464, + "auxiliary_loss_mlp": 0.01405218, + "balance_loss_clip": 1.15874052, + "balance_loss_mlp": 1.05750847, + "epoch": 0.07130617766421163, + "flos": 14686233433920.0, + "grad_norm": 2.224939660535614, + "language_loss": 0.65622103, + "learning_rate": 3.982129564464596e-06, + "loss": 0.6856879, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.8219802379608154 + }, + { + "auxiliary_loss_clip": 0.01541869, + "auxiliary_loss_mlp": 0.01403784, + "balance_loss_clip": 1.16022992, + "balance_loss_mlp": 1.05817199, + "epoch": 0.07136630091687961, + "flos": 26070332613120.0, + "grad_norm": 2.29774661744082, + "language_loss": 0.69550693, + "learning_rate": 3.98207757993998e-06, + "loss": 0.72496343, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.7634642124176025 + }, + { + "auxiliary_loss_clip": 0.01543771, + "auxiliary_loss_mlp": 0.01386673, + "balance_loss_clip": 1.16136861, + "balance_loss_mlp": 1.04125261, + "epoch": 0.07142642416954757, + "flos": 15671010016800.0, + "grad_norm": 2.9405712792753635, + "language_loss": 0.78398263, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81328714, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.7043306827545166 + }, + { + "auxiliary_loss_clip": 0.01550669, + "auxiliary_loss_mlp": 0.01395975, + "balance_loss_clip": 1.16986346, + "balance_loss_mlp": 1.05150759, + "epoch": 0.07148654742221554, + "flos": 19757354922720.0, + "grad_norm": 2.2226412407877025, + "language_loss": 0.85142541, + "learning_rate": 3.981973385410981e-06, + "loss": 0.8808918, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 2.777466058731079 + }, + { + "auxiliary_loss_clip": 0.01538538, + "auxiliary_loss_mlp": 0.01406997, + "balance_loss_clip": 1.15669048, + "balance_loss_mlp": 1.06195736, + "epoch": 0.07154667067488352, + "flos": 23473782840960.0, + "grad_norm": 1.8907301545181026, + "language_loss": 0.76817715, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79763246, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 2.764641761779785 + }, + { + "auxiliary_loss_clip": 0.01536305, + "auxiliary_loss_mlp": 0.01399374, + "balance_loss_clip": 1.15420699, + "balance_loss_mlp": 1.06177306, + "epoch": 0.07160679392755148, + "flos": 18334920929760.0, + "grad_norm": 2.1483913016864804, + "language_loss": 0.75921309, + "learning_rate": 3.981868890255468e-06, + "loss": 0.78856987, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.7808594703674316 + }, + { + "auxiliary_loss_clip": 0.01538019, + "auxiliary_loss_mlp": 0.01432345, + "balance_loss_clip": 1.15485764, + "balance_loss_mlp": 1.10466218, + "epoch": 0.07166691718021945, + "flos": 17748849172320.0, + "grad_norm": 4.025682767654247, + "language_loss": 0.73983908, + "learning_rate": 3.981816529947719e-06, + "loss": 0.76954269, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 2.82206392288208 + }, + { + "auxiliary_loss_clip": 0.01538853, + "auxiliary_loss_mlp": 0.01417167, + "balance_loss_clip": 1.15630925, + "balance_loss_mlp": 1.08280909, + "epoch": 0.07172704043288743, + "flos": 22453922345760.0, + "grad_norm": 3.01267327822766, + "language_loss": 0.78444278, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.81400299, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.779081106185913 + }, + { + "auxiliary_loss_clip": 0.01537096, + "auxiliary_loss_mlp": 0.01405605, + "balance_loss_clip": 1.1545105, + "balance_loss_mlp": 1.07181883, + "epoch": 0.07178716368555539, + "flos": 23224478495040.0, + "grad_norm": 2.117370501924125, + "language_loss": 0.85609925, + "learning_rate": 3.981711583882166e-06, + "loss": 0.8855263, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 4.340369939804077 + }, + { + "auxiliary_loss_clip": 0.01535836, + "auxiliary_loss_mlp": 0.01396979, + "balance_loss_clip": 1.15568769, + "balance_loss_mlp": 1.070822, + "epoch": 0.07184728693822336, + "flos": 25152917171040.0, + "grad_norm": 2.0215565946043275, + "language_loss": 0.81892371, + "learning_rate": 3.981658998128341e-06, + "loss": 0.84825182, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 2.7514936923980713 + }, + { + "auxiliary_loss_clip": 0.01533332, + "auxiliary_loss_mlp": 0.01398831, + "balance_loss_clip": 1.15223694, + "balance_loss_mlp": 1.07305598, + "epoch": 0.07190741019089132, + "flos": 22713467294880.0, + "grad_norm": 2.9173606010606243, + "language_loss": 0.79908371, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82840526, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 2.847046136856079 + }, + { + "auxiliary_loss_clip": 0.01534825, + "auxiliary_loss_mlp": 0.01395983, + "balance_loss_clip": 1.15288937, + "balance_loss_mlp": 1.06715608, + "epoch": 0.0719675334435593, + "flos": 29352289086720.0, + "grad_norm": 2.867991610175318, + "language_loss": 0.70758712, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.7368952, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 4.481780290603638 + }, + { + "auxiliary_loss_clip": 0.01531524, + "auxiliary_loss_mlp": 0.01409422, + "balance_loss_clip": 1.15062618, + "balance_loss_mlp": 1.08669853, + "epoch": 0.07202765669622727, + "flos": 17641738955520.0, + "grad_norm": 5.80889684076205, + "language_loss": 0.86289698, + "learning_rate": 3.98150079000661e-06, + "loss": 0.89230645, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 4.248533487319946 + }, + { + "auxiliary_loss_clip": 0.01538959, + "auxiliary_loss_mlp": 0.01423893, + "balance_loss_clip": 1.15829468, + "balance_loss_mlp": 1.10040724, + "epoch": 0.07208777994889523, + "flos": 21436185827520.0, + "grad_norm": 2.167626928845158, + "language_loss": 0.84152412, + "learning_rate": 3.981447903685947e-06, + "loss": 0.87115264, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 4.2842793464660645 + }, + { + "auxiliary_loss_clip": 0.01541301, + "auxiliary_loss_mlp": 0.0140519, + "balance_loss_clip": 1.16117668, + "balance_loss_mlp": 1.08017778, + "epoch": 0.07214790320156321, + "flos": 26943182102880.0, + "grad_norm": 8.564793174685017, + "language_loss": 0.76771802, + "learning_rate": 3.981394942228581e-06, + "loss": 0.79718298, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 2.833315849304199 + }, + { + "auxiliary_loss_clip": 0.01538566, + "auxiliary_loss_mlp": 0.01401528, + "balance_loss_clip": 1.15738988, + "balance_loss_mlp": 1.0704124, + "epoch": 0.07220802645423118, + "flos": 23882311059840.0, + "grad_norm": 2.505535116564164, + "language_loss": 0.83312428, + "learning_rate": 3.98134190563652e-06, + "loss": 0.86252522, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.9298365116119385 + }, + { + "auxiliary_loss_clip": 0.01539119, + "auxiliary_loss_mlp": 0.01424708, + "balance_loss_clip": 1.15819645, + "balance_loss_mlp": 1.09340119, + "epoch": 0.07226814970689914, + "flos": 19245585159360.0, + "grad_norm": 3.1675181090025752, + "language_loss": 0.69441593, + "learning_rate": 3.981288793911775e-06, + "loss": 0.72405422, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.767388105392456 + }, + { + "auxiliary_loss_clip": 0.01545089, + "auxiliary_loss_mlp": 0.01415783, + "balance_loss_clip": 1.16616201, + "balance_loss_mlp": 1.08733797, + "epoch": 0.07232827295956712, + "flos": 19174051920960.0, + "grad_norm": 2.124888544037077, + "language_loss": 0.88186276, + "learning_rate": 3.98123560705636e-06, + "loss": 0.91147155, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.7834653854370117 + }, + { + "auxiliary_loss_clip": 0.01542517, + "auxiliary_loss_mlp": 0.01418126, + "balance_loss_clip": 1.16448784, + "balance_loss_mlp": 1.0931139, + "epoch": 0.07238839621223508, + "flos": 17641701027360.0, + "grad_norm": 3.3983842570737384, + "language_loss": 0.78525269, + "learning_rate": 3.981182345072293e-06, + "loss": 0.81485915, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.777996301651001 + }, + { + "auxiliary_loss_clip": 0.01534685, + "auxiliary_loss_mlp": 0.01407547, + "balance_loss_clip": 1.1557405, + "balance_loss_mlp": 1.07471478, + "epoch": 0.07244851946490305, + "flos": 28294879348800.0, + "grad_norm": 1.845197303701982, + "language_loss": 0.82402509, + "learning_rate": 3.981129007961593e-06, + "loss": 0.85344738, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.8542683124542236 + }, + { + "auxiliary_loss_clip": 0.01542206, + "auxiliary_loss_mlp": 0.01402747, + "balance_loss_clip": 1.16312099, + "balance_loss_mlp": 1.07048702, + "epoch": 0.07250864271757101, + "flos": 22567025211840.0, + "grad_norm": 3.6812585483775107, + "language_loss": 0.76599902, + "learning_rate": 3.981075595726283e-06, + "loss": 0.79544854, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.7715492248535156 + }, + { + "auxiliary_loss_clip": 0.01536415, + "auxiliary_loss_mlp": 0.01394811, + "balance_loss_clip": 1.15675783, + "balance_loss_mlp": 1.06388628, + "epoch": 0.072568765970239, + "flos": 21764571115680.0, + "grad_norm": 2.065471636446354, + "language_loss": 0.77427351, + "learning_rate": 3.981022108368387e-06, + "loss": 0.80358577, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.814512014389038 + }, + { + "auxiliary_loss_clip": 0.01539865, + "auxiliary_loss_mlp": 0.01411043, + "balance_loss_clip": 1.15974426, + "balance_loss_mlp": 1.08450449, + "epoch": 0.07262888922290696, + "flos": 25522037667360.0, + "grad_norm": 2.670323501416346, + "language_loss": 0.79575908, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.82526815, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.775247573852539 + }, + { + "auxiliary_loss_clip": 0.01535368, + "auxiliary_loss_mlp": 0.01392736, + "balance_loss_clip": 1.15749717, + "balance_loss_mlp": 1.05532634, + "epoch": 0.07268901247557492, + "flos": 21248401685760.0, + "grad_norm": 7.703639482061694, + "language_loss": 0.78554779, + "learning_rate": 3.980914908292955e-06, + "loss": 0.81482875, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.789743423461914 + }, + { + "auxiliary_loss_clip": 0.0153025, + "auxiliary_loss_mlp": 0.01397285, + "balance_loss_clip": 1.15028095, + "balance_loss_mlp": 1.05129206, + "epoch": 0.0727491357282429, + "flos": 25481416243680.0, + "grad_norm": 3.7916178624870023, + "language_loss": 0.81081676, + "learning_rate": 3.980861195579486e-06, + "loss": 0.84009206, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 2.82869815826416 + }, + { + "auxiliary_loss_clip": 0.0154598, + "auxiliary_loss_mlp": 0.01395726, + "balance_loss_clip": 1.16711652, + "balance_loss_mlp": 1.05621839, + "epoch": 0.07280925898091087, + "flos": 24464476216800.0, + "grad_norm": 4.360331277057054, + "language_loss": 0.84496439, + "learning_rate": 3.98080740775156e-06, + "loss": 0.87438142, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 2.7710111141204834 + }, + { + "auxiliary_loss_clip": 0.01538685, + "auxiliary_loss_mlp": 0.01398844, + "balance_loss_clip": 1.16025996, + "balance_loss_mlp": 1.06105232, + "epoch": 0.07286938223357883, + "flos": 18289710198720.0, + "grad_norm": 2.6869819563964694, + "language_loss": 0.91184556, + "learning_rate": 3.98075354481122e-06, + "loss": 0.94122088, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.792691230773926 + }, + { + "auxiliary_loss_clip": 0.01542945, + "auxiliary_loss_mlp": 0.01386766, + "balance_loss_clip": 1.16591191, + "balance_loss_mlp": 1.0546968, + "epoch": 0.07292950548624681, + "flos": 21216920948640.0, + "grad_norm": 2.5564479837128857, + "language_loss": 0.72621155, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.75550866, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 2.780059576034546 + }, + { + "auxiliary_loss_clip": 0.01535559, + "auxiliary_loss_mlp": 0.01390282, + "balance_loss_clip": 1.15833783, + "balance_loss_mlp": 1.05191803, + "epoch": 0.07298962873891478, + "flos": 24644181660480.0, + "grad_norm": 2.802195101210954, + "language_loss": 0.83969069, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86894917, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 2.7608654499053955 + }, + { + "auxiliary_loss_clip": 0.01534051, + "auxiliary_loss_mlp": 0.01398452, + "balance_loss_clip": 1.15639925, + "balance_loss_mlp": 1.06409383, + "epoch": 0.07304975199158274, + "flos": 27055222980480.0, + "grad_norm": 2.7485941069595605, + "language_loss": 0.83917749, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86850262, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.7929630279541016 + }, + { + "auxiliary_loss_clip": 0.0154193, + "auxiliary_loss_mlp": 0.01392581, + "balance_loss_clip": 1.16588211, + "balance_loss_mlp": 1.05326414, + "epoch": 0.07310987524425071, + "flos": 33552495421920.0, + "grad_norm": 1.8591654106834319, + "language_loss": 0.81222677, + "learning_rate": 3.980537341966595e-06, + "loss": 0.84157193, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.8416879177093506 + }, + { + "auxiliary_loss_clip": 0.01540861, + "auxiliary_loss_mlp": 0.01418628, + "balance_loss_clip": 1.16579163, + "balance_loss_mlp": 1.07225382, + "epoch": 0.07316999849691869, + "flos": 28113467137920.0, + "grad_norm": 2.779818571143074, + "language_loss": 0.76317978, + "learning_rate": 3.980483103494872e-06, + "loss": 0.79277468, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 2.8517470359802246 + }, + { + "auxiliary_loss_clip": 0.01537969, + "auxiliary_loss_mlp": 0.01387955, + "balance_loss_clip": 1.16033864, + "balance_loss_mlp": 1.05569506, + "epoch": 0.07323012174958665, + "flos": 14394373328160.0, + "grad_norm": 3.0745864858838736, + "language_loss": 0.86115277, + "learning_rate": 3.98042878992303e-06, + "loss": 0.89041197, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.798534631729126 + }, + { + "auxiliary_loss_clip": 0.01534708, + "auxiliary_loss_mlp": 0.01407636, + "balance_loss_clip": 1.15869164, + "balance_loss_mlp": 1.08281481, + "epoch": 0.07329024500225462, + "flos": 21618584170560.0, + "grad_norm": 2.1112076211779973, + "language_loss": 0.87009799, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.89952141, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.770651340484619 + }, + { + "auxiliary_loss_clip": 0.0153817, + "auxiliary_loss_mlp": 0.01393031, + "balance_loss_clip": 1.16249299, + "balance_loss_mlp": 1.06534803, + "epoch": 0.0733503682549226, + "flos": 13225795060320.0, + "grad_norm": 3.971243553100551, + "language_loss": 0.8485111, + "learning_rate": 3.980319937487235e-06, + "loss": 0.87782311, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 2.7548880577087402 + }, + { + "auxiliary_loss_clip": 0.01546604, + "auxiliary_loss_mlp": 0.01407411, + "balance_loss_clip": 1.17092383, + "balance_loss_mlp": 1.0812546, + "epoch": 0.07341049150759056, + "flos": 20889104582880.0, + "grad_norm": 4.44274129972197, + "language_loss": 0.77572179, + "learning_rate": 3.98026539862741e-06, + "loss": 0.80526197, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 2.756230115890503 + }, + { + "auxiliary_loss_clip": 0.01539183, + "auxiliary_loss_mlp": 0.01395685, + "balance_loss_clip": 1.16504288, + "balance_loss_mlp": 1.07486868, + "epoch": 0.07347061476025853, + "flos": 15415409596320.0, + "grad_norm": 9.971197101138928, + "language_loss": 0.91863787, + "learning_rate": 3.980210784675722e-06, + "loss": 0.9479866, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 2.701707601547241 + }, + { + "auxiliary_loss_clip": 0.01536335, + "auxiliary_loss_mlp": 0.01392191, + "balance_loss_clip": 1.16185546, + "balance_loss_mlp": 1.06584382, + "epoch": 0.0735307380129265, + "flos": 11110444590240.0, + "grad_norm": 4.025229080406087, + "language_loss": 0.90932953, + "learning_rate": 3.980156095634242e-06, + "loss": 0.93861479, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.753675937652588 + }, + { + "auxiliary_loss_clip": 0.01536169, + "auxiliary_loss_mlp": 0.01408113, + "balance_loss_clip": 1.16242814, + "balance_loss_mlp": 1.07833254, + "epoch": 0.07359086126559447, + "flos": 23734655275680.0, + "grad_norm": 3.0722621918666775, + "language_loss": 0.82151401, + "learning_rate": 3.980101331505045e-06, + "loss": 0.8509568, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.767146110534668 + }, + { + "auxiliary_loss_clip": 0.01545716, + "auxiliary_loss_mlp": 0.01404045, + "balance_loss_clip": 1.1713177, + "balance_loss_mlp": 1.07426453, + "epoch": 0.07365098451826244, + "flos": 20995228667520.0, + "grad_norm": 4.300351130512412, + "language_loss": 0.83408076, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.86357832, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.7693395614624023 + }, + { + "auxiliary_loss_clip": 0.01536036, + "auxiliary_loss_mlp": 0.01399574, + "balance_loss_clip": 1.16338944, + "balance_loss_mlp": 1.077232, + "epoch": 0.0737111077709304, + "flos": 19935391527360.0, + "grad_norm": 8.588133694568203, + "language_loss": 0.902964, + "learning_rate": 3.979991577991808e-06, + "loss": 0.93232, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.7755563259124756 + }, + { + "auxiliary_loss_clip": 0.01531897, + "auxiliary_loss_mlp": 0.01422529, + "balance_loss_clip": 1.1580565, + "balance_loss_mlp": 1.10266674, + "epoch": 0.07377123102359838, + "flos": 16583418941760.0, + "grad_norm": 4.126585737618073, + "language_loss": 0.76919222, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79873651, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.71624755859375 + }, + { + "auxiliary_loss_clip": 0.01538298, + "auxiliary_loss_mlp": 0.0140116, + "balance_loss_clip": 1.16499114, + "balance_loss_mlp": 1.07786405, + "epoch": 0.07383135427626634, + "flos": 28331025249600.0, + "grad_norm": 3.0699914511315236, + "language_loss": 0.85862279, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.88801742, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 2.876105308532715 + }, + { + "auxiliary_loss_clip": 0.0154233, + "auxiliary_loss_mlp": 0.01400146, + "balance_loss_clip": 1.17054176, + "balance_loss_mlp": 1.07189107, + "epoch": 0.07389147752893431, + "flos": 20049215028480.0, + "grad_norm": 4.854262446251023, + "language_loss": 0.79763091, + "learning_rate": 3.97982638461608e-06, + "loss": 0.82705563, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.738325834274292 + }, + { + "auxiliary_loss_clip": 0.01541113, + "auxiliary_loss_mlp": 0.01398187, + "balance_loss_clip": 1.16987252, + "balance_loss_mlp": 1.06745303, + "epoch": 0.07395160078160229, + "flos": 18116111188800.0, + "grad_norm": 10.504918290104063, + "language_loss": 0.78195226, + "learning_rate": 3.979771170004287e-06, + "loss": 0.81134528, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 2.773695707321167 + }, + { + "auxiliary_loss_clip": 0.01541107, + "auxiliary_loss_mlp": 0.01397565, + "balance_loss_clip": 1.16713595, + "balance_loss_mlp": 1.06950092, + "epoch": 0.07401172403427025, + "flos": 23589199324800.0, + "grad_norm": 6.479987282838815, + "language_loss": 0.81560749, + "learning_rate": 3.979715880319372e-06, + "loss": 0.84499419, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.825664758682251 + }, + { + "auxiliary_loss_clip": 0.01535697, + "auxiliary_loss_mlp": 0.01387065, + "balance_loss_clip": 1.16480207, + "balance_loss_mlp": 1.04984546, + "epoch": 0.07407184728693822, + "flos": 26362344431520.0, + "grad_norm": 4.600218912854876, + "language_loss": 0.95499074, + "learning_rate": 3.979660515563434e-06, + "loss": 0.98421836, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 4.3448004722595215 + }, + { + "auxiliary_loss_clip": 0.01533119, + "auxiliary_loss_mlp": 0.01407385, + "balance_loss_clip": 1.1604867, + "balance_loss_mlp": 1.08103752, + "epoch": 0.0741319705396062, + "flos": 22202645735520.0, + "grad_norm": 4.1930468950882185, + "language_loss": 0.81339043, + "learning_rate": 3.979605075738569e-06, + "loss": 0.84279549, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.843179225921631 + }, + { + "auxiliary_loss_clip": 0.015344, + "auxiliary_loss_mlp": 0.01395017, + "balance_loss_clip": 1.16227651, + "balance_loss_mlp": 1.06599903, + "epoch": 0.07419209379227416, + "flos": 39203506378080.0, + "grad_norm": 3.1894596358415734, + "language_loss": 0.71129394, + "learning_rate": 3.979549560846883e-06, + "loss": 0.74058807, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 2.9950995445251465 + }, + { + "auxiliary_loss_clip": 0.01538042, + "auxiliary_loss_mlp": 0.01381795, + "balance_loss_clip": 1.16797769, + "balance_loss_mlp": 1.04781818, + "epoch": 0.07425221704494213, + "flos": 22783748904000.0, + "grad_norm": 2.1434533195058605, + "language_loss": 0.76939744, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79859579, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 2.7879624366760254 + }, + { + "auxiliary_loss_clip": 0.01527176, + "auxiliary_loss_mlp": 0.01391068, + "balance_loss_clip": 1.15669823, + "balance_loss_mlp": 1.06090641, + "epoch": 0.0743123402976101, + "flos": 22275165106080.0, + "grad_norm": 3.592225459353943, + "language_loss": 0.83047152, + "learning_rate": 3.979438305871464e-06, + "loss": 0.85965395, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 4.320820569992065 + }, + { + "auxiliary_loss_clip": 0.01533706, + "auxiliary_loss_mlp": 0.01392156, + "balance_loss_clip": 1.16374707, + "balance_loss_mlp": 1.06485486, + "epoch": 0.07437246355027807, + "flos": 29317622384160.0, + "grad_norm": 2.446287501960669, + "language_loss": 0.75530696, + "learning_rate": 3.979382565791951e-06, + "loss": 0.78456557, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 4.356599807739258 + }, + { + "auxiliary_loss_clip": 0.01531217, + "auxiliary_loss_mlp": 0.01388362, + "balance_loss_clip": 1.16056561, + "balance_loss_mlp": 1.05743718, + "epoch": 0.07443258680294604, + "flos": 31947625157760.0, + "grad_norm": 2.478861252618702, + "language_loss": 0.77510512, + "learning_rate": 3.979326750654053e-06, + "loss": 0.8043009, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 4.378015756607056 + }, + { + "auxiliary_loss_clip": 0.01542011, + "auxiliary_loss_mlp": 0.01394802, + "balance_loss_clip": 1.17038155, + "balance_loss_mlp": 1.06406784, + "epoch": 0.074492710055614, + "flos": 22677548963040.0, + "grad_norm": 4.519276918987281, + "language_loss": 0.8633846, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.89275277, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 2.7446532249450684 + }, + { + "auxiliary_loss_clip": 0.0153472, + "auxiliary_loss_mlp": 0.01383583, + "balance_loss_clip": 1.16271281, + "balance_loss_mlp": 1.05399287, + "epoch": 0.07455283330828198, + "flos": 21286861204320.0, + "grad_norm": 3.6313722655303367, + "language_loss": 0.89198095, + "learning_rate": 3.979214895211569e-06, + "loss": 0.92116392, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.823768138885498 + }, + { + "auxiliary_loss_clip": 0.01546808, + "auxiliary_loss_mlp": 0.01395813, + "balance_loss_clip": 1.17515755, + "balance_loss_mlp": 1.06050158, + "epoch": 0.07461295656094995, + "flos": 24390667288800.0, + "grad_norm": 2.1447315036186536, + "language_loss": 0.88836563, + "learning_rate": 3.979158854911225e-06, + "loss": 0.91779184, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.787790298461914 + }, + { + "auxiliary_loss_clip": 0.0164373, + "auxiliary_loss_mlp": 0.01391457, + "balance_loss_clip": 1.26639962, + "balance_loss_mlp": 1.0878067, + "epoch": 0.07467307981361791, + "flos": 62115794909280.0, + "grad_norm": 0.9611138866186316, + "language_loss": 0.63016677, + "learning_rate": 3.979102739560979e-06, + "loss": 0.66051865, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.3484058380126953 + }, + { + "auxiliary_loss_clip": 0.01532325, + "auxiliary_loss_mlp": 0.01408007, + "balance_loss_clip": 1.1616987, + "balance_loss_mlp": 1.08528376, + "epoch": 0.07473320306628589, + "flos": 24865456731840.0, + "grad_norm": 5.329188888129539, + "language_loss": 0.62885666, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65825999, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.859637498855591 + }, + { + "auxiliary_loss_clip": 0.01533373, + "auxiliary_loss_mlp": 0.01422554, + "balance_loss_clip": 1.16320968, + "balance_loss_mlp": 1.10745943, + "epoch": 0.07479332631895386, + "flos": 24899630368320.0, + "grad_norm": 2.257349805060259, + "language_loss": 0.75735784, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78691709, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.01530039, + "auxiliary_loss_mlp": 0.0142149, + "balance_loss_clip": 1.15842748, + "balance_loss_mlp": 1.10410762, + "epoch": 0.07485344957162182, + "flos": 17816817163680.0, + "grad_norm": 9.946321717532081, + "language_loss": 0.69041562, + "learning_rate": 3.978933943232123e-06, + "loss": 0.71993089, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.7209627628326416 + }, + { + "auxiliary_loss_clip": 0.01531326, + "auxiliary_loss_mlp": 0.01421651, + "balance_loss_clip": 1.15960109, + "balance_loss_mlp": 1.10846424, + "epoch": 0.0749135728242898, + "flos": 25012581521760.0, + "grad_norm": 4.801974810369009, + "language_loss": 0.88616014, + "learning_rate": 3.978877527703576e-06, + "loss": 0.91568995, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.799713611602783 + }, + { + "auxiliary_loss_clip": 0.01531206, + "auxiliary_loss_mlp": 0.01431284, + "balance_loss_clip": 1.16046691, + "balance_loss_mlp": 1.11638141, + "epoch": 0.07497369607695777, + "flos": 17824023514080.0, + "grad_norm": 7.982369891545817, + "language_loss": 0.88120973, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.91083461, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.7724335193634033 + }, + { + "auxiliary_loss_clip": 0.01539684, + "auxiliary_loss_mlp": 0.01421204, + "balance_loss_clip": 1.1695025, + "balance_loss_mlp": 1.11030614, + "epoch": 0.07503381932962573, + "flos": 15122828855520.0, + "grad_norm": 8.395427041476424, + "language_loss": 0.6448561, + "learning_rate": 3.978764471530921e-06, + "loss": 0.674465, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.69744610786438 + }, + { + "auxiliary_loss_clip": 0.01542227, + "auxiliary_loss_mlp": 0.01444638, + "balance_loss_clip": 1.17307389, + "balance_loss_mlp": 1.14117861, + "epoch": 0.0750939425822937, + "flos": 12818025404640.0, + "grad_norm": 4.608279468358942, + "language_loss": 0.74268532, + "learning_rate": 3.978707830891102e-06, + "loss": 0.77255392, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 2.7253546714782715 + }, + { + "auxiliary_loss_clip": 0.01536103, + "auxiliary_loss_mlp": 0.0143858, + "balance_loss_clip": 1.1654768, + "balance_loss_mlp": 1.12787282, + "epoch": 0.07515406583496168, + "flos": 24209141293440.0, + "grad_norm": 4.618737363076973, + "language_loss": 0.82527149, + "learning_rate": 3.978651115218482e-06, + "loss": 0.85501838, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 2.743619441986084 + }, + { + "auxiliary_loss_clip": 0.01542288, + "auxiliary_loss_mlp": 0.0144806, + "balance_loss_clip": 1.17565346, + "balance_loss_mlp": 1.14364719, + "epoch": 0.07521418908762964, + "flos": 26690615935200.0, + "grad_norm": 5.13698415592876, + "language_loss": 0.66767919, + "learning_rate": 3.978594324515215e-06, + "loss": 0.69758266, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 2.8037407398223877 + }, + { + "auxiliary_loss_clip": 0.01617048, + "auxiliary_loss_mlp": 0.0152594, + "balance_loss_clip": 1.24843562, + "balance_loss_mlp": 1.3008728, + "epoch": 0.0752743123402976, + "flos": 59101879292640.0, + "grad_norm": 1.1431470042201801, + "language_loss": 0.70286345, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.73429334, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 3.3894565105438232 + }, + { + "auxiliary_loss_clip": 0.01535921, + "auxiliary_loss_mlp": 0.01393053, + "balance_loss_clip": 1.16761899, + "balance_loss_mlp": 1.07071114, + "epoch": 0.07533443559296558, + "flos": 23479282424160.0, + "grad_norm": 4.741084993336576, + "language_loss": 0.79827261, + "learning_rate": 3.97848051802535e-06, + "loss": 0.82756233, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.7703959941864014 + }, + { + "auxiliary_loss_clip": 0.01546, + "auxiliary_loss_mlp": 0.01420912, + "balance_loss_clip": 1.17779338, + "balance_loss_mlp": 1.10009599, + "epoch": 0.07539455884563355, + "flos": 20880798315840.0, + "grad_norm": 3.494473036485446, + "language_loss": 0.93380678, + "learning_rate": 3.978423502243069e-06, + "loss": 0.96347594, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 2.8315250873565674 + }, + { + "auxiliary_loss_clip": 0.01545735, + "auxiliary_loss_mlp": 0.01394722, + "balance_loss_clip": 1.17768705, + "balance_loss_mlp": 1.06322455, + "epoch": 0.07545468209830151, + "flos": 27675392518080.0, + "grad_norm": 2.9399486326222064, + "language_loss": 0.88161969, + "learning_rate": 3.97836641143877e-06, + "loss": 0.91102421, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.808361053466797 + }, + { + "auxiliary_loss_clip": 0.01544295, + "auxiliary_loss_mlp": 0.01387128, + "balance_loss_clip": 1.17696059, + "balance_loss_mlp": 1.05543995, + "epoch": 0.0755148053509695, + "flos": 14138734979520.0, + "grad_norm": 2.401371635696973, + "language_loss": 0.79355294, + "learning_rate": 3.978309245614618e-06, + "loss": 0.82286716, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 2.8060495853424072 + }, + { + "auxiliary_loss_clip": 0.01618174, + "auxiliary_loss_mlp": 0.01337402, + "balance_loss_clip": 1.25584877, + "balance_loss_mlp": 1.04138184, + "epoch": 0.07557492860363746, + "flos": 58240787896800.0, + "grad_norm": 0.8268143607517549, + "language_loss": 0.57924211, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60879785, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.45904278755188 + }, + { + "auxiliary_loss_clip": 0.01550385, + "auxiliary_loss_mlp": 0.01416589, + "balance_loss_clip": 1.18404353, + "balance_loss_mlp": 1.07460141, + "epoch": 0.07563505185630542, + "flos": 24646684919040.0, + "grad_norm": 4.856951362233865, + "language_loss": 0.89683771, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92650747, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.786902666091919 + }, + { + "auxiliary_loss_clip": 0.01549363, + "auxiliary_loss_mlp": 0.0140438, + "balance_loss_clip": 1.18198633, + "balance_loss_mlp": 1.06105769, + "epoch": 0.07569517510897339, + "flos": 15524795502720.0, + "grad_norm": 13.672100282215204, + "language_loss": 0.81487226, + "learning_rate": 3.978137298044741e-06, + "loss": 0.8444097, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.766390323638916 + }, + { + "auxiliary_loss_clip": 0.01540725, + "auxiliary_loss_mlp": 0.01391252, + "balance_loss_clip": 1.17373729, + "balance_loss_mlp": 1.05555844, + "epoch": 0.07575529836164137, + "flos": 22930684053120.0, + "grad_norm": 2.142807820771586, + "language_loss": 0.75565308, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78497285, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 2.7769439220428467 + }, + { + "auxiliary_loss_clip": 0.01543918, + "auxiliary_loss_mlp": 0.01403057, + "balance_loss_clip": 1.17766106, + "balance_loss_mlp": 1.05305886, + "epoch": 0.07581542161430933, + "flos": 19502475137280.0, + "grad_norm": 5.5915023453633435, + "language_loss": 0.84976029, + "learning_rate": 3.978022291272044e-06, + "loss": 0.87923014, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.752650260925293 + }, + { + "auxiliary_loss_clip": 0.01549752, + "auxiliary_loss_mlp": 0.01411289, + "balance_loss_clip": 1.18396544, + "balance_loss_mlp": 1.06167185, + "epoch": 0.0758755448669773, + "flos": 24975828770400.0, + "grad_norm": 8.443888562655808, + "language_loss": 0.82794207, + "learning_rate": 3.977964675374399e-06, + "loss": 0.85755247, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.8132219314575195 + }, + { + "auxiliary_loss_clip": 0.01542846, + "auxiliary_loss_mlp": 0.01403031, + "balance_loss_clip": 1.1751442, + "balance_loss_mlp": 1.0574193, + "epoch": 0.07593566811964528, + "flos": 22750220046240.0, + "grad_norm": 4.289984120586939, + "language_loss": 0.82391882, + "learning_rate": 3.977906984472136e-06, + "loss": 0.85337758, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.932849168777466 + }, + { + "auxiliary_loss_clip": 0.01540126, + "auxiliary_loss_mlp": 0.01417549, + "balance_loss_clip": 1.17152357, + "balance_loss_mlp": 1.07918525, + "epoch": 0.07599579137231324, + "flos": 23114637450720.0, + "grad_norm": 3.4447655930101067, + "language_loss": 0.7642312, + "learning_rate": 3.977849218567442e-06, + "loss": 0.79380798, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.8022282123565674 + }, + { + "auxiliary_loss_clip": 0.01543706, + "auxiliary_loss_mlp": 0.0140649, + "balance_loss_clip": 1.17465162, + "balance_loss_mlp": 1.06717288, + "epoch": 0.07605591462498121, + "flos": 14503835090880.0, + "grad_norm": 2.90342052818427, + "language_loss": 0.81060421, + "learning_rate": 3.977791377662507e-06, + "loss": 0.84010613, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.784001350402832 + }, + { + "auxiliary_loss_clip": 0.01539169, + "auxiliary_loss_mlp": 0.01400692, + "balance_loss_clip": 1.17115331, + "balance_loss_mlp": 1.056988, + "epoch": 0.07611603787764919, + "flos": 23516490313440.0, + "grad_norm": 3.0626343707951875, + "language_loss": 0.65564275, + "learning_rate": 3.977733461759524e-06, + "loss": 0.68504131, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 2.773404359817505 + }, + { + "auxiliary_loss_clip": 0.01540969, + "auxiliary_loss_mlp": 0.0138674, + "balance_loss_clip": 1.17281938, + "balance_loss_mlp": 1.04704142, + "epoch": 0.07617616113031715, + "flos": 21509349976800.0, + "grad_norm": 3.139939198528914, + "language_loss": 0.79490179, + "learning_rate": 3.977675470860691e-06, + "loss": 0.82417887, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 2.7867512702941895 + }, + { + "auxiliary_loss_clip": 0.01540201, + "auxiliary_loss_mlp": 0.01384563, + "balance_loss_clip": 1.17153835, + "balance_loss_mlp": 1.05459189, + "epoch": 0.07623628438298512, + "flos": 14574837335040.0, + "grad_norm": 4.181276697495103, + "language_loss": 0.729936, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75918365, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 2.742469072341919 + }, + { + "auxiliary_loss_clip": 0.01543852, + "auxiliary_loss_mlp": 0.01393997, + "balance_loss_clip": 1.17521465, + "balance_loss_mlp": 1.0626905, + "epoch": 0.07629640763565308, + "flos": 14722151765760.0, + "grad_norm": 3.417993274984344, + "language_loss": 0.82472968, + "learning_rate": 3.977559264084269e-06, + "loss": 0.85410815, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 2.7138683795928955 + }, + { + "auxiliary_loss_clip": 0.01533632, + "auxiliary_loss_mlp": 0.01394947, + "balance_loss_clip": 1.16316104, + "balance_loss_mlp": 1.06383097, + "epoch": 0.07635653088832106, + "flos": 14904701821440.0, + "grad_norm": 3.832135989479865, + "language_loss": 0.88481486, + "learning_rate": 3.977501048211088e-06, + "loss": 0.91410059, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 4.452094793319702 + }, + { + "auxiliary_loss_clip": 0.01545758, + "auxiliary_loss_mlp": 0.01392844, + "balance_loss_clip": 1.177212, + "balance_loss_mlp": 1.0670687, + "epoch": 0.07641665414098903, + "flos": 26654014896480.0, + "grad_norm": 2.4838499052580425, + "language_loss": 0.71155, + "learning_rate": 3.977442757350869e-06, + "loss": 0.74093604, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 2.885051965713501 + }, + { + "auxiliary_loss_clip": 0.01547295, + "auxiliary_loss_mlp": 0.01395921, + "balance_loss_clip": 1.17889738, + "balance_loss_mlp": 1.06881046, + "epoch": 0.07647677739365699, + "flos": 25195207433760.0, + "grad_norm": 2.341421989486008, + "language_loss": 0.82564878, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85508096, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 2.848280668258667 + }, + { + "auxiliary_loss_clip": 0.01540337, + "auxiliary_loss_mlp": 0.01403093, + "balance_loss_clip": 1.17077386, + "balance_loss_mlp": 1.08361244, + "epoch": 0.07653690064632497, + "flos": 20560226228640.0, + "grad_norm": 3.1098741131135066, + "language_loss": 0.80183935, + "learning_rate": 3.977325950678162e-06, + "loss": 0.83127373, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 2.83123779296875 + }, + { + "auxiliary_loss_clip": 0.01539958, + "auxiliary_loss_mlp": 0.01416345, + "balance_loss_clip": 1.17021894, + "balance_loss_mlp": 1.0896163, + "epoch": 0.07659702389899294, + "flos": 22271296433760.0, + "grad_norm": 2.056174549156194, + "language_loss": 0.81039274, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83995575, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 4.3397908210754395 + }, + { + "auxiliary_loss_clip": 0.01543033, + "auxiliary_loss_mlp": 0.01408995, + "balance_loss_clip": 1.17177463, + "balance_loss_mlp": 1.08760715, + "epoch": 0.0766571471516609, + "flos": 32638948652160.0, + "grad_norm": 2.273435290341352, + "language_loss": 0.73064494, + "learning_rate": 3.977208844083865e-06, + "loss": 0.76016521, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 5.982100248336792 + }, + { + "auxiliary_loss_clip": 0.01551708, + "auxiliary_loss_mlp": 0.01408163, + "balance_loss_clip": 1.18074608, + "balance_loss_mlp": 1.08906388, + "epoch": 0.07671727040432888, + "flos": 15269157154080.0, + "grad_norm": 2.263547305660562, + "language_loss": 0.79413933, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.82373798, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.7368948459625244 + }, + { + "auxiliary_loss_clip": 0.01539614, + "auxiliary_loss_mlp": 0.01410403, + "balance_loss_clip": 1.16936564, + "balance_loss_mlp": 1.08844233, + "epoch": 0.07677739365699685, + "flos": 28186631287200.0, + "grad_norm": 3.8046588231017657, + "language_loss": 0.60023999, + "learning_rate": 3.97709143758574e-06, + "loss": 0.62974024, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.8227314949035645 + }, + { + "auxiliary_loss_clip": 0.01543077, + "auxiliary_loss_mlp": 0.01397017, + "balance_loss_clip": 1.17134142, + "balance_loss_mlp": 1.07410288, + "epoch": 0.07683751690966481, + "flos": 18298092322080.0, + "grad_norm": 9.344313031107982, + "language_loss": 0.74719286, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77659386, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 2.782482624053955 + }, + { + "auxiliary_loss_clip": 0.01544062, + "auxiliary_loss_mlp": 0.01405813, + "balance_loss_clip": 1.17431188, + "balance_loss_mlp": 1.09186351, + "epoch": 0.07689764016233278, + "flos": 21983267072160.0, + "grad_norm": 4.675638281001182, + "language_loss": 0.88301384, + "learning_rate": 3.976973731201596e-06, + "loss": 0.9125126, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 2.7344698905944824 + }, + { + "auxiliary_loss_clip": 0.01547451, + "auxiliary_loss_mlp": 0.01417688, + "balance_loss_clip": 1.1772666, + "balance_loss_mlp": 1.10488307, + "epoch": 0.07695776341500075, + "flos": 22238146857600.0, + "grad_norm": 3.4467756520877946, + "language_loss": 0.82888263, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85853398, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 2.825683116912842 + }, + { + "auxiliary_loss_clip": 0.01542841, + "auxiliary_loss_mlp": 0.01407695, + "balance_loss_clip": 1.17226422, + "balance_loss_mlp": 1.0935545, + "epoch": 0.07701788666766872, + "flos": 16145609819040.0, + "grad_norm": 4.548410995393567, + "language_loss": 0.76241708, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.79192245, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.8017890453338623 + }, + { + "auxiliary_loss_clip": 0.0154053, + "auxiliary_loss_mlp": 0.01401348, + "balance_loss_clip": 1.16980863, + "balance_loss_mlp": 1.08224869, + "epoch": 0.07707800992033668, + "flos": 19465039679040.0, + "grad_norm": 2.787201448525758, + "language_loss": 0.75531185, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.78473067, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.775053024291992 + }, + { + "auxiliary_loss_clip": 0.01548704, + "auxiliary_loss_mlp": 0.01408245, + "balance_loss_clip": 1.17766154, + "balance_loss_mlp": 1.08971751, + "epoch": 0.07713813317300466, + "flos": 18992336284800.0, + "grad_norm": 2.1921676340582676, + "language_loss": 0.83840436, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86797386, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.807598829269409 + }, + { + "auxiliary_loss_clip": 0.01549103, + "auxiliary_loss_mlp": 0.01393197, + "balance_loss_clip": 1.17865002, + "balance_loss_mlp": 1.07447934, + "epoch": 0.07719825642567263, + "flos": 18115921548000.0, + "grad_norm": 4.697916996652721, + "language_loss": 0.74962807, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77905095, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.79958176612854 + }, + { + "auxiliary_loss_clip": 0.01543937, + "auxiliary_loss_mlp": 0.0138972, + "balance_loss_clip": 1.17269969, + "balance_loss_mlp": 1.06299114, + "epoch": 0.0772583796783406, + "flos": 42197736915360.0, + "grad_norm": 1.7896121914427217, + "language_loss": 0.76368618, + "learning_rate": 3.976618812911817e-06, + "loss": 0.79302281, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 3.0697133541107178 + }, + { + "auxiliary_loss_clip": 0.0154675, + "auxiliary_loss_mlp": 0.01406614, + "balance_loss_clip": 1.17707074, + "balance_loss_mlp": 1.08751476, + "epoch": 0.07731850293100857, + "flos": 24755994969120.0, + "grad_norm": 2.148175467743634, + "language_loss": 0.83877021, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86830389, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.8001880645751953 + }, + { + "auxiliary_loss_clip": 0.01550066, + "auxiliary_loss_mlp": 0.01412578, + "balance_loss_clip": 1.1810503, + "balance_loss_mlp": 1.0847044, + "epoch": 0.07737862618367654, + "flos": 17567778314880.0, + "grad_norm": 4.5013888245828575, + "language_loss": 0.77181947, + "learning_rate": 3.97649990716259e-06, + "loss": 0.80144584, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.8038008213043213 + }, + { + "auxiliary_loss_clip": 0.0153956, + "auxiliary_loss_mlp": 0.01386114, + "balance_loss_clip": 1.16855562, + "balance_loss_mlp": 1.06663322, + "epoch": 0.0774387494363445, + "flos": 25629375453120.0, + "grad_norm": 2.42030278532193, + "language_loss": 0.84696507, + "learning_rate": 3.976440341863237e-06, + "loss": 0.87622178, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.826308012008667 + }, + { + "auxiliary_loss_clip": 0.01542996, + "auxiliary_loss_mlp": 0.01405956, + "balance_loss_clip": 1.17317581, + "balance_loss_mlp": 1.07522202, + "epoch": 0.07749887268901248, + "flos": 12241511543520.0, + "grad_norm": 2.253097041298333, + "language_loss": 0.8533181, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88280767, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 2.748944044113159 + }, + { + "auxiliary_loss_clip": 0.01540951, + "auxiliary_loss_mlp": 0.01393835, + "balance_loss_clip": 1.16904318, + "balance_loss_mlp": 1.07130218, + "epoch": 0.07755899594168045, + "flos": 25084038903840.0, + "grad_norm": 2.0452007680241557, + "language_loss": 0.85279787, + "learning_rate": 3.976320986426344e-06, + "loss": 0.88214564, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 2.8099372386932373 + }, + { + "auxiliary_loss_clip": 0.01545542, + "auxiliary_loss_mlp": 0.01390975, + "balance_loss_clip": 1.17540359, + "balance_loss_mlp": 1.06539035, + "epoch": 0.07761911919434841, + "flos": 14248158814080.0, + "grad_norm": 2.4340611326371877, + "language_loss": 0.90629047, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93565571, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 2.784433126449585 + }, + { + "auxiliary_loss_clip": 0.01666904, + "auxiliary_loss_mlp": 0.01308571, + "balance_loss_clip": 1.30720639, + "balance_loss_mlp": 1.02246857, + "epoch": 0.07767924244701638, + "flos": 67244719278240.0, + "grad_norm": 0.9475697929906562, + "language_loss": 0.65006816, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67982292, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.4358532428741455 + }, + { + "auxiliary_loss_clip": 0.01545606, + "auxiliary_loss_mlp": 0.01394394, + "balance_loss_clip": 1.17539358, + "balance_loss_mlp": 1.06652057, + "epoch": 0.07773936569968436, + "flos": 28553286453120.0, + "grad_norm": 2.058828301653417, + "language_loss": 0.87623167, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.90563166, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.8647608757019043 + }, + { + "auxiliary_loss_clip": 0.01550992, + "auxiliary_loss_mlp": 0.01404624, + "balance_loss_clip": 1.1816628, + "balance_loss_mlp": 1.0819006, + "epoch": 0.07779948895235232, + "flos": 27492728677920.0, + "grad_norm": 3.876171333427401, + "language_loss": 0.84812552, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87768173, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 2.8771796226501465 + }, + { + "auxiliary_loss_clip": 0.01558957, + "auxiliary_loss_mlp": 0.01398663, + "balance_loss_clip": 1.19021368, + "balance_loss_mlp": 1.07708442, + "epoch": 0.07785961220502029, + "flos": 18225269526240.0, + "grad_norm": 2.633286750166362, + "language_loss": 0.79007518, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81965137, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.7637779712677 + }, + { + "auxiliary_loss_clip": 0.01541602, + "auxiliary_loss_mlp": 0.01394434, + "balance_loss_clip": 1.17278469, + "balance_loss_mlp": 1.07171106, + "epoch": 0.07791973545768827, + "flos": 24610539018240.0, + "grad_norm": 2.9134179531529885, + "language_loss": 0.88225937, + "learning_rate": 3.975961121573371e-06, + "loss": 0.91161978, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 2.9769580364227295 + }, + { + "auxiliary_loss_clip": 0.01542373, + "auxiliary_loss_mlp": 0.01431425, + "balance_loss_clip": 1.17467821, + "balance_loss_mlp": 1.11499643, + "epoch": 0.07797985871035623, + "flos": 14283773720640.0, + "grad_norm": 4.195503566078445, + "language_loss": 0.96401531, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.99375319, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.717576026916504 + }, + { + "auxiliary_loss_clip": 0.01540093, + "auxiliary_loss_mlp": 0.01402054, + "balance_loss_clip": 1.17094314, + "balance_loss_mlp": 1.08314538, + "epoch": 0.0780399819630242, + "flos": 26612634909600.0, + "grad_norm": 3.8987582989300114, + "language_loss": 0.76385748, + "learning_rate": 3.97584056716893e-06, + "loss": 0.79327905, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.871975898742676 + }, + { + "auxiliary_loss_clip": 0.01540119, + "auxiliary_loss_mlp": 0.01391397, + "balance_loss_clip": 1.17077851, + "balance_loss_mlp": 1.07611227, + "epoch": 0.07810010521569218, + "flos": 21836635348320.0, + "grad_norm": 1.8336268715058401, + "language_loss": 0.81010902, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.83942413, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 2.8128280639648438 + }, + { + "auxiliary_loss_clip": 0.0154858, + "auxiliary_loss_mlp": 0.01412276, + "balance_loss_clip": 1.17908418, + "balance_loss_mlp": 1.09584665, + "epoch": 0.07816022846836014, + "flos": 25083507909600.0, + "grad_norm": 3.687654022872223, + "language_loss": 0.86487174, + "learning_rate": 3.975719713068202e-06, + "loss": 0.89448035, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.8540070056915283 + }, + { + "auxiliary_loss_clip": 0.01537423, + "auxiliary_loss_mlp": 0.01402559, + "balance_loss_clip": 1.1689657, + "balance_loss_mlp": 1.0830785, + "epoch": 0.0782203517210281, + "flos": 40920758873280.0, + "grad_norm": 2.583890792164007, + "language_loss": 0.72032279, + "learning_rate": 3.975659173637458e-06, + "loss": 0.7497226, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.956479549407959 + }, + { + "auxiliary_loss_clip": 0.0154241, + "auxiliary_loss_mlp": 0.01389216, + "balance_loss_clip": 1.1734333, + "balance_loss_mlp": 1.07259655, + "epoch": 0.07828047497369607, + "flos": 41175221448960.0, + "grad_norm": 5.280183883251125, + "language_loss": 0.70771492, + "learning_rate": 3.97559855928952e-06, + "loss": 0.73703116, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.943161964416504 + }, + { + "auxiliary_loss_clip": 0.01536303, + "auxiliary_loss_mlp": 0.0140384, + "balance_loss_clip": 1.16724825, + "balance_loss_mlp": 1.08149838, + "epoch": 0.07834059822636405, + "flos": 23510080454400.0, + "grad_norm": 6.338936812280721, + "language_loss": 0.82102019, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.85042155, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.801980495452881 + }, + { + "auxiliary_loss_clip": 0.01541333, + "auxiliary_loss_mlp": 0.01402678, + "balance_loss_clip": 1.17321086, + "balance_loss_mlp": 1.08777452, + "epoch": 0.07840072147903202, + "flos": 20195998464960.0, + "grad_norm": 2.531767015278311, + "language_loss": 0.75046766, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77990776, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.794884204864502 + }, + { + "auxiliary_loss_clip": 0.01544123, + "auxiliary_loss_mlp": 0.01381135, + "balance_loss_clip": 1.17540061, + "balance_loss_mlp": 1.05822062, + "epoch": 0.07846084473169998, + "flos": 21362945821920.0, + "grad_norm": 3.319785507330533, + "language_loss": 0.76017022, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78942281, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.782195568084717 + }, + { + "auxiliary_loss_clip": 0.01542592, + "auxiliary_loss_mlp": 0.01378742, + "balance_loss_clip": 1.17371774, + "balance_loss_mlp": 1.05582833, + "epoch": 0.07852096798436796, + "flos": 25413334467840.0, + "grad_norm": 3.255135277155947, + "language_loss": 0.85042346, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87963676, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 2.7706830501556396 + }, + { + "auxiliary_loss_clip": 0.01539853, + "auxiliary_loss_mlp": 0.01397137, + "balance_loss_clip": 1.17270255, + "balance_loss_mlp": 1.08032608, + "epoch": 0.07858109123703592, + "flos": 24573786266880.0, + "grad_norm": 3.1841649997810904, + "language_loss": 0.90415859, + "learning_rate": 3.975294363872468e-06, + "loss": 0.93352842, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 2.908076524734497 + }, + { + "auxiliary_loss_clip": 0.01539158, + "auxiliary_loss_mlp": 0.0140557, + "balance_loss_clip": 1.17148435, + "balance_loss_mlp": 1.09143007, + "epoch": 0.07864121448970389, + "flos": 20700675662400.0, + "grad_norm": 2.5041074296520143, + "language_loss": 0.83041584, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85986316, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.7838926315307617 + }, + { + "auxiliary_loss_clip": 0.01533022, + "auxiliary_loss_mlp": 0.01403742, + "balance_loss_clip": 1.16686082, + "balance_loss_mlp": 1.08693123, + "epoch": 0.07870133774237187, + "flos": 22968764290080.0, + "grad_norm": 2.459624182228681, + "language_loss": 0.77708906, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80645674, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 4.329012393951416 + }, + { + "auxiliary_loss_clip": 0.01535264, + "auxiliary_loss_mlp": 0.01388399, + "balance_loss_clip": 1.16681063, + "balance_loss_mlp": 1.0672015, + "epoch": 0.07876146099503983, + "flos": 18844794285120.0, + "grad_norm": 3.121130207544018, + "language_loss": 0.80379593, + "learning_rate": 3.975110947763453e-06, + "loss": 0.83303261, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 2.782475709915161 + }, + { + "auxiliary_loss_clip": 0.01538675, + "auxiliary_loss_mlp": 0.01381453, + "balance_loss_clip": 1.17129827, + "balance_loss_mlp": 1.06120908, + "epoch": 0.0788215842477078, + "flos": 23808084922080.0, + "grad_norm": 2.077768073847608, + "language_loss": 0.73204619, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.76124746, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 2.783296585083008 + }, + { + "auxiliary_loss_clip": 0.01545835, + "auxiliary_loss_mlp": 0.01387648, + "balance_loss_clip": 1.17886424, + "balance_loss_mlp": 1.07064676, + "epoch": 0.07888170750037576, + "flos": 21582058988160.0, + "grad_norm": 2.2361847585599146, + "language_loss": 0.85980797, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88914275, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 2.768625259399414 + }, + { + "auxiliary_loss_clip": 0.01540779, + "auxiliary_loss_mlp": 0.01399167, + "balance_loss_clip": 1.17384779, + "balance_loss_mlp": 1.08521748, + "epoch": 0.07894183075304374, + "flos": 19866892541760.0, + "grad_norm": 3.0991570211455275, + "language_loss": 0.8233189, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.85271835, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 4.354744911193848 + }, + { + "auxiliary_loss_clip": 0.01538544, + "auxiliary_loss_mlp": 0.01385831, + "balance_loss_clip": 1.17030549, + "balance_loss_mlp": 1.06596816, + "epoch": 0.07900195400571171, + "flos": 16145306393760.0, + "grad_norm": 4.050756978273631, + "language_loss": 0.73443437, + "learning_rate": 3.97486534441264e-06, + "loss": 0.76367807, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 2.8269641399383545 + }, + { + "auxiliary_loss_clip": 0.01534861, + "auxiliary_loss_mlp": 0.01393519, + "balance_loss_clip": 1.16679537, + "balance_loss_mlp": 1.07594526, + "epoch": 0.07906207725837967, + "flos": 23732569226880.0, + "grad_norm": 1.9867431124820323, + "language_loss": 0.79628438, + "learning_rate": 3.974803756351379e-06, + "loss": 0.8255682, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 2.7788772583007812 + }, + { + "auxiliary_loss_clip": 0.01534301, + "auxiliary_loss_mlp": 0.01391705, + "balance_loss_clip": 1.16668034, + "balance_loss_mlp": 1.06078017, + "epoch": 0.07912220051104765, + "flos": 24318185846400.0, + "grad_norm": 2.4313977489588123, + "language_loss": 0.73821276, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76747286, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 2.8443472385406494 + }, + { + "auxiliary_loss_clip": 0.01541756, + "auxiliary_loss_mlp": 0.0139635, + "balance_loss_clip": 1.17401564, + "balance_loss_mlp": 1.07553351, + "epoch": 0.07918232376371562, + "flos": 18882040102560.0, + "grad_norm": 3.047834740167464, + "language_loss": 0.65570152, + "learning_rate": 3.974680355576927e-06, + "loss": 0.68508255, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.8173577785491943 + }, + { + "auxiliary_loss_clip": 0.01545662, + "auxiliary_loss_mlp": 0.0140975, + "balance_loss_clip": 1.17745543, + "balance_loss_mlp": 1.08130455, + "epoch": 0.07924244701638358, + "flos": 27378487967040.0, + "grad_norm": 4.766476632360868, + "language_loss": 0.73440611, + "learning_rate": 3.974618542868415e-06, + "loss": 0.76396024, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.836533308029175 + }, + { + "auxiliary_loss_clip": 0.01533501, + "auxiliary_loss_mlp": 0.01391644, + "balance_loss_clip": 1.16554904, + "balance_loss_mlp": 1.07407069, + "epoch": 0.07930257026905156, + "flos": 25122763919520.0, + "grad_norm": 2.588449808678129, + "language_loss": 0.90675187, + "learning_rate": 3.97455665528217e-06, + "loss": 0.93600333, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 2.792752742767334 + }, + { + "auxiliary_loss_clip": 0.0152837, + "auxiliary_loss_mlp": 0.0138093, + "balance_loss_clip": 1.15926242, + "balance_loss_mlp": 1.06221247, + "epoch": 0.07936269352171953, + "flos": 21836597420160.0, + "grad_norm": 12.996843275788526, + "language_loss": 0.80246502, + "learning_rate": 3.974494692820539e-06, + "loss": 0.83155799, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 2.8296186923980713 + }, + { + "auxiliary_loss_clip": 0.01547183, + "auxiliary_loss_mlp": 0.01383284, + "balance_loss_clip": 1.17762327, + "balance_loss_mlp": 1.06590164, + "epoch": 0.07942281677438749, + "flos": 16941426487200.0, + "grad_norm": 8.240231764921008, + "language_loss": 0.68966269, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71896738, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.7843594551086426 + }, + { + "auxiliary_loss_clip": 0.01536927, + "auxiliary_loss_mlp": 0.01394945, + "balance_loss_clip": 1.16892934, + "balance_loss_mlp": 1.08004189, + "epoch": 0.07948294002705546, + "flos": 18988808965920.0, + "grad_norm": 6.1256181950299435, + "language_loss": 0.84577322, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.87509191, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.755990505218506 + }, + { + "auxiliary_loss_clip": 0.01535373, + "auxiliary_loss_mlp": 0.01395387, + "balance_loss_clip": 1.16690516, + "balance_loss_mlp": 1.07705069, + "epoch": 0.07954306327972344, + "flos": 21655640347200.0, + "grad_norm": 2.8832479634239516, + "language_loss": 0.90297592, + "learning_rate": 3.974308356206838e-06, + "loss": 0.93228346, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.8029022216796875 + }, + { + "auxiliary_loss_clip": 0.01537303, + "auxiliary_loss_mlp": 0.01391051, + "balance_loss_clip": 1.16768384, + "balance_loss_mlp": 1.07233262, + "epoch": 0.0796031865323914, + "flos": 23222430374400.0, + "grad_norm": 2.4456694059329562, + "language_loss": 0.82329047, + "learning_rate": 3.974246094267187e-06, + "loss": 0.85257399, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.8305251598358154 + }, + { + "auxiliary_loss_clip": 0.01538669, + "auxiliary_loss_mlp": 0.01383986, + "balance_loss_clip": 1.16895115, + "balance_loss_mlp": 1.06507695, + "epoch": 0.07966330978505937, + "flos": 23296770296640.0, + "grad_norm": 3.1283906358157725, + "language_loss": 0.79596007, + "learning_rate": 3.974183757463925e-06, + "loss": 0.82518667, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.755195379257202 + }, + { + "auxiliary_loss_clip": 0.01533103, + "auxiliary_loss_mlp": 0.01373243, + "balance_loss_clip": 1.16387928, + "balance_loss_mlp": 1.06062853, + "epoch": 0.07972343303772735, + "flos": 18365453462880.0, + "grad_norm": 3.561051080381938, + "language_loss": 0.87781584, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90687931, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.8556110858917236 + }, + { + "auxiliary_loss_clip": 0.01533052, + "auxiliary_loss_mlp": 0.01397625, + "balance_loss_clip": 1.16279364, + "balance_loss_mlp": 1.08405721, + "epoch": 0.07978355629039531, + "flos": 21764684900160.0, + "grad_norm": 3.455590948423584, + "language_loss": 0.83092821, + "learning_rate": 3.974058859276032e-06, + "loss": 0.86023492, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.795534610748291 + }, + { + "auxiliary_loss_clip": 0.0154002, + "auxiliary_loss_mlp": 0.01396921, + "balance_loss_clip": 1.16895962, + "balance_loss_mlp": 1.08258998, + "epoch": 0.07984367954306328, + "flos": 18553275532800.0, + "grad_norm": 7.742912835814766, + "language_loss": 0.78795564, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.81732512, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 2.7553317546844482 + }, + { + "auxiliary_loss_clip": 0.01542757, + "auxiliary_loss_mlp": 0.01392145, + "balance_loss_clip": 1.17272401, + "balance_loss_mlp": 1.07075632, + "epoch": 0.07990380279573125, + "flos": 16905621939840.0, + "grad_norm": 3.246829070005736, + "language_loss": 0.74435228, + "learning_rate": 3.973933661662101e-06, + "loss": 0.77370125, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 2.798393964767456 + }, + { + "auxiliary_loss_clip": 0.01537836, + "auxiliary_loss_mlp": 0.01384381, + "balance_loss_clip": 1.1668421, + "balance_loss_mlp": 1.0648998, + "epoch": 0.07996392604839922, + "flos": 24100703591040.0, + "grad_norm": 2.379989105285263, + "language_loss": 0.81177676, + "learning_rate": 3.973870950576305e-06, + "loss": 0.84099895, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 2.828829526901245 + }, + { + "auxiliary_loss_clip": 0.01545692, + "auxiliary_loss_mlp": 0.01391217, + "balance_loss_clip": 1.17645741, + "balance_loss_mlp": 1.07860279, + "epoch": 0.08002404930106718, + "flos": 14280246401760.0, + "grad_norm": 4.677497502057414, + "language_loss": 0.88587379, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91524285, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.722627878189087 + }, + { + "auxiliary_loss_clip": 0.0154022, + "auxiliary_loss_mlp": 0.01389301, + "balance_loss_clip": 1.17105579, + "balance_loss_mlp": 1.0766871, + "epoch": 0.08008417255373516, + "flos": 40409406319680.0, + "grad_norm": 4.075659656127336, + "language_loss": 0.73099768, + "learning_rate": 3.973745303858942e-06, + "loss": 0.76029289, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.930510997772217 + }, + { + "auxiliary_loss_clip": 0.01542778, + "auxiliary_loss_mlp": 0.01393051, + "balance_loss_clip": 1.17198944, + "balance_loss_mlp": 1.08482313, + "epoch": 0.08014429580640313, + "flos": 18480907874880.0, + "grad_norm": 13.618671069749006, + "language_loss": 0.82723254, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85659081, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 2.7503976821899414 + }, + { + "auxiliary_loss_clip": 0.01537593, + "auxiliary_loss_mlp": 0.01384974, + "balance_loss_clip": 1.16593516, + "balance_loss_mlp": 1.07293129, + "epoch": 0.0802044190590711, + "flos": 22055596801920.0, + "grad_norm": 4.425556095677385, + "language_loss": 0.75075543, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77998114, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.7706332206726074 + }, + { + "auxiliary_loss_clip": 0.01550463, + "auxiliary_loss_mlp": 0.013843, + "balance_loss_clip": 1.1790669, + "balance_loss_mlp": 1.07893348, + "epoch": 0.08026454231173906, + "flos": 24574468973760.0, + "grad_norm": 2.433796266327732, + "language_loss": 0.80152047, + "learning_rate": 3.973556272454221e-06, + "loss": 0.83086807, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 2.7831802368164062 + }, + { + "auxiliary_loss_clip": 0.0168449, + "auxiliary_loss_mlp": 0.01410248, + "balance_loss_clip": 1.31274033, + "balance_loss_mlp": 1.17297363, + "epoch": 0.08032466556440704, + "flos": 52587159897600.0, + "grad_norm": 1.0228774367979088, + "language_loss": 0.55938947, + "learning_rate": 3.973493112307889e-06, + "loss": 0.59033692, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.361768960952759 + }, + { + "auxiliary_loss_clip": 0.01540692, + "auxiliary_loss_mlp": 0.0138258, + "balance_loss_clip": 1.16980422, + "balance_loss_mlp": 1.07187271, + "epoch": 0.080384788817075, + "flos": 23844913529760.0, + "grad_norm": 4.856468119320041, + "language_loss": 0.67657411, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70580679, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.8461148738861084 + }, + { + "auxiliary_loss_clip": 0.01549296, + "auxiliary_loss_mlp": 0.01393732, + "balance_loss_clip": 1.17714715, + "balance_loss_mlp": 1.07215285, + "epoch": 0.08044491206974297, + "flos": 25303038285600.0, + "grad_norm": 2.409555401374337, + "language_loss": 0.86644173, + "learning_rate": 3.973366567512453e-06, + "loss": 0.895872, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 2.823152780532837 + }, + { + "auxiliary_loss_clip": 0.0154017, + "auxiliary_loss_mlp": 0.01386197, + "balance_loss_clip": 1.16697574, + "balance_loss_mlp": 1.06499898, + "epoch": 0.08050503532241095, + "flos": 22378406650560.0, + "grad_norm": 4.865836409518068, + "language_loss": 0.8762666, + "learning_rate": 3.973303182868147e-06, + "loss": 0.90553021, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.833986282348633 + }, + { + "auxiliary_loss_clip": 0.01541789, + "auxiliary_loss_mlp": 0.01401361, + "balance_loss_clip": 1.170573, + "balance_loss_mlp": 1.07730198, + "epoch": 0.08056515857507891, + "flos": 18371408184000.0, + "grad_norm": 3.02366177557949, + "language_loss": 0.89545846, + "learning_rate": 3.973239723395988e-06, + "loss": 0.92488992, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.8129358291625977 + }, + { + "auxiliary_loss_clip": 0.01671166, + "auxiliary_loss_mlp": 0.01281891, + "balance_loss_clip": 1.29832923, + "balance_loss_mlp": 1.01104736, + "epoch": 0.08062528182774688, + "flos": 51353761675680.0, + "grad_norm": 0.9011019979116912, + "language_loss": 0.64789218, + "learning_rate": 3.97317618909838e-06, + "loss": 0.67742276, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 3.3221802711486816 + }, + { + "auxiliary_loss_clip": 0.01547262, + "auxiliary_loss_mlp": 0.01405804, + "balance_loss_clip": 1.174909, + "balance_loss_mlp": 1.07545114, + "epoch": 0.08068540508041486, + "flos": 17601458885280.0, + "grad_norm": 2.264843772412161, + "language_loss": 0.896864, + "learning_rate": 3.973112579977733e-06, + "loss": 0.9263947, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.785850763320923 + }, + { + "auxiliary_loss_clip": 0.01545652, + "auxiliary_loss_mlp": 0.01382266, + "balance_loss_clip": 1.17276156, + "balance_loss_mlp": 1.05095947, + "epoch": 0.08074552833308282, + "flos": 10562718566880.0, + "grad_norm": 3.0271960268811466, + "language_loss": 0.76632029, + "learning_rate": 3.973048896036459e-06, + "loss": 0.79559946, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.8026392459869385 + }, + { + "auxiliary_loss_clip": 0.01659357, + "auxiliary_loss_mlp": 0.01287178, + "balance_loss_clip": 1.2864635, + "balance_loss_mlp": 1.01328278, + "epoch": 0.08080565158575079, + "flos": 60846743852640.0, + "grad_norm": 0.8200848720396311, + "language_loss": 0.57391483, + "learning_rate": 3.972985137276974e-06, + "loss": 0.6033802, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 3.1760079860687256 + }, + { + "auxiliary_loss_clip": 0.0154558, + "auxiliary_loss_mlp": 0.01384542, + "balance_loss_clip": 1.17402542, + "balance_loss_mlp": 1.06105566, + "epoch": 0.08086577483841875, + "flos": 18334314079200.0, + "grad_norm": 2.4091441654040997, + "language_loss": 0.86413491, + "learning_rate": 3.972921303701695e-06, + "loss": 0.89343613, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 2.7178211212158203 + }, + { + "auxiliary_loss_clip": 0.01536709, + "auxiliary_loss_mlp": 0.01377171, + "balance_loss_clip": 1.1646626, + "balance_loss_mlp": 1.0616951, + "epoch": 0.08092589809108673, + "flos": 21545761374720.0, + "grad_norm": 3.313697853121391, + "language_loss": 0.87990582, + "learning_rate": 3.972857395313042e-06, + "loss": 0.90904462, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 2.8223602771759033 + }, + { + "auxiliary_loss_clip": 0.01537316, + "auxiliary_loss_mlp": 0.01381029, + "balance_loss_clip": 1.16596937, + "balance_loss_mlp": 1.06231093, + "epoch": 0.0809860213437547, + "flos": 22130240149440.0, + "grad_norm": 1.700673002678969, + "language_loss": 0.92922372, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95840716, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 4.336812734603882 + }, + { + "auxiliary_loss_clip": 0.01541127, + "auxiliary_loss_mlp": 0.013914, + "balance_loss_clip": 1.16963565, + "balance_loss_mlp": 1.07554293, + "epoch": 0.08104614459642266, + "flos": 21727628723520.0, + "grad_norm": 4.716793730500001, + "language_loss": 0.89225376, + "learning_rate": 3.972729354105312e-06, + "loss": 0.92157912, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 2.8585591316223145 + }, + { + "auxiliary_loss_clip": 0.01543174, + "auxiliary_loss_mlp": 0.01409763, + "balance_loss_clip": 1.17079616, + "balance_loss_mlp": 1.10725725, + "epoch": 0.08110626784909064, + "flos": 23954185651680.0, + "grad_norm": 2.003118561115828, + "language_loss": 0.76509041, + "learning_rate": 3.97266522129109e-06, + "loss": 0.7946198, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 2.807995557785034 + }, + { + "auxiliary_loss_clip": 0.01538005, + "auxiliary_loss_mlp": 0.01390952, + "balance_loss_clip": 1.16525912, + "balance_loss_mlp": 1.07757521, + "epoch": 0.0811663911017586, + "flos": 19027685694240.0, + "grad_norm": 2.505996400621359, + "language_loss": 0.8861562, + "learning_rate": 3.972601013673205e-06, + "loss": 0.91544569, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 2.831482410430908 + }, + { + "auxiliary_loss_clip": 0.01546264, + "auxiliary_loss_mlp": 0.01401889, + "balance_loss_clip": 1.17382264, + "balance_loss_mlp": 1.09442425, + "epoch": 0.08122651435442657, + "flos": 15343117794720.0, + "grad_norm": 5.205195186859057, + "language_loss": 0.82047689, + "learning_rate": 3.972536731254092e-06, + "loss": 0.84995842, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 5.917403221130371 + }, + { + "auxiliary_loss_clip": 0.01541553, + "auxiliary_loss_mlp": 0.01389882, + "balance_loss_clip": 1.16907299, + "balance_loss_mlp": 1.09290755, + "epoch": 0.08128663760709455, + "flos": 23223795788160.0, + "grad_norm": 3.567174168265464, + "language_loss": 0.75516129, + "learning_rate": 3.972472374036189e-06, + "loss": 0.78447568, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 4.4523093700408936 + }, + { + "auxiliary_loss_clip": 0.01547582, + "auxiliary_loss_mlp": 0.0144389, + "balance_loss_clip": 1.17382312, + "balance_loss_mlp": 1.14367342, + "epoch": 0.08134676085976252, + "flos": 22967816086080.0, + "grad_norm": 2.8406847882759965, + "language_loss": 0.8279438, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85785854, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.8413922786712646 + }, + { + "auxiliary_loss_clip": 0.01632704, + "auxiliary_loss_mlp": 0.0132473, + "balance_loss_clip": 1.25899076, + "balance_loss_mlp": 1.08974457, + "epoch": 0.08140688411243048, + "flos": 64327635711360.0, + "grad_norm": 0.9018120330866422, + "language_loss": 0.59646642, + "learning_rate": 3.972343435213775e-06, + "loss": 0.6260407, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.3398828506469727 + }, + { + "auxiliary_loss_clip": 0.01536405, + "auxiliary_loss_mlp": 0.01399949, + "balance_loss_clip": 1.16388535, + "balance_loss_mlp": 1.08866966, + "epoch": 0.08146700736509845, + "flos": 22494050703360.0, + "grad_norm": 2.3145694334838285, + "language_loss": 0.82765293, + "learning_rate": 3.972278853614154e-06, + "loss": 0.8570165, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 2.8852148056030273 + }, + { + "auxiliary_loss_clip": 0.01542172, + "auxiliary_loss_mlp": 0.01399874, + "balance_loss_clip": 1.1678797, + "balance_loss_mlp": 1.08992982, + "epoch": 0.08152713061776642, + "flos": 20449854190080.0, + "grad_norm": 2.278078053910915, + "language_loss": 0.7092818, + "learning_rate": 3.972214197225521e-06, + "loss": 0.7387023, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.8003597259521484 + }, + { + "auxiliary_loss_clip": 0.01542551, + "auxiliary_loss_mlp": 0.01390571, + "balance_loss_clip": 1.16929674, + "balance_loss_mlp": 1.07356977, + "epoch": 0.08158725387043439, + "flos": 23552484501600.0, + "grad_norm": 2.292186951155308, + "language_loss": 0.70509493, + "learning_rate": 3.972149466050329e-06, + "loss": 0.73442614, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.7778286933898926 + }, + { + "auxiliary_loss_clip": 0.01544342, + "auxiliary_loss_mlp": 0.01407739, + "balance_loss_clip": 1.17144489, + "balance_loss_mlp": 1.08673251, + "epoch": 0.08164737712310235, + "flos": 22019488829280.0, + "grad_norm": 3.175379224651318, + "language_loss": 0.8433072, + "learning_rate": 3.97208466009103e-06, + "loss": 0.87282801, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 2.7912185192108154 + }, + { + "auxiliary_loss_clip": 0.01548311, + "auxiliary_loss_mlp": 0.01404243, + "balance_loss_clip": 1.17483473, + "balance_loss_mlp": 1.08895874, + "epoch": 0.08170750037577033, + "flos": 23370086158560.0, + "grad_norm": 5.112357947791361, + "language_loss": 1.03037107, + "learning_rate": 3.972019779350084e-06, + "loss": 1.05989659, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 2.7938761711120605 + }, + { + "auxiliary_loss_clip": 0.01542191, + "auxiliary_loss_mlp": 0.01401147, + "balance_loss_clip": 1.16894031, + "balance_loss_mlp": 1.0738461, + "epoch": 0.0817676236284383, + "flos": 28400244870240.0, + "grad_norm": 3.721231030390617, + "language_loss": 0.83393925, + "learning_rate": 3.971954823829951e-06, + "loss": 0.86337262, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 2.856707811355591 + }, + { + "auxiliary_loss_clip": 0.01544347, + "auxiliary_loss_mlp": 0.01401079, + "balance_loss_clip": 1.16960621, + "balance_loss_mlp": 1.07511306, + "epoch": 0.08182774688110626, + "flos": 19210956384960.0, + "grad_norm": 4.224167819487874, + "language_loss": 0.7222867, + "learning_rate": 3.971889793533093e-06, + "loss": 0.75174099, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 2.805246591567993 + }, + { + "auxiliary_loss_clip": 0.01536625, + "auxiliary_loss_mlp": 0.01390007, + "balance_loss_clip": 1.16256571, + "balance_loss_mlp": 1.06442225, + "epoch": 0.08188787013377424, + "flos": 22786328018880.0, + "grad_norm": 2.7294159639943922, + "language_loss": 0.77234656, + "learning_rate": 3.971824688461976e-06, + "loss": 0.80161285, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.79021954536438 + }, + { + "auxiliary_loss_clip": 0.01542492, + "auxiliary_loss_mlp": 0.01374597, + "balance_loss_clip": 1.16747522, + "balance_loss_mlp": 1.05168283, + "epoch": 0.08194799338644221, + "flos": 16469747153280.0, + "grad_norm": 22.379177241700926, + "language_loss": 0.72591698, + "learning_rate": 3.971759508619069e-06, + "loss": 0.75508785, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 2.811420440673828 + }, + { + "auxiliary_loss_clip": 0.0154971, + "auxiliary_loss_mlp": 0.01386506, + "balance_loss_clip": 1.17561543, + "balance_loss_mlp": 1.05634451, + "epoch": 0.08200811663911017, + "flos": 23915839917600.0, + "grad_norm": 4.638747326057353, + "language_loss": 0.76833135, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79769349, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.787659168243408 + }, + { + "auxiliary_loss_clip": 0.01550257, + "auxiliary_loss_mlp": 0.01418727, + "balance_loss_clip": 1.17584074, + "balance_loss_mlp": 1.08627629, + "epoch": 0.08206823989177814, + "flos": 17898591005280.0, + "grad_norm": 1.9228540117984105, + "language_loss": 0.81928813, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84897798, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.8211781978607178 + }, + { + "auxiliary_loss_clip": 0.01543622, + "auxiliary_loss_mlp": 0.01381184, + "balance_loss_clip": 1.17128253, + "balance_loss_mlp": 1.05312014, + "epoch": 0.08212836314444612, + "flos": 22090225576320.0, + "grad_norm": 1.9105156396575071, + "language_loss": 0.82161224, + "learning_rate": 3.97156352048434e-06, + "loss": 0.8508603, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.79296612739563 + }, + { + "auxiliary_loss_clip": 0.01538162, + "auxiliary_loss_mlp": 0.01373334, + "balance_loss_clip": 1.16529691, + "balance_loss_mlp": 1.05194581, + "epoch": 0.08218848639711408, + "flos": 17599107339360.0, + "grad_norm": 1.8944555869569264, + "language_loss": 0.8212266, + "learning_rate": 3.97149804157902e-06, + "loss": 0.85034156, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 2.7961244583129883 + }, + { + "auxiliary_loss_clip": 0.01540852, + "auxiliary_loss_mlp": 0.0138435, + "balance_loss_clip": 1.1675446, + "balance_loss_mlp": 1.05914712, + "epoch": 0.08224860964978205, + "flos": 17859752205120.0, + "grad_norm": 2.925233540823357, + "language_loss": 0.83895135, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.86820334, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 2.8121414184570312 + }, + { + "auxiliary_loss_clip": 0.01547652, + "auxiliary_loss_mlp": 0.01376906, + "balance_loss_clip": 1.17313027, + "balance_loss_mlp": 1.05895138, + "epoch": 0.08230873290245003, + "flos": 25229836208160.0, + "grad_norm": 2.0932290005364145, + "language_loss": 0.81181967, + "learning_rate": 3.971366859492653e-06, + "loss": 0.84106523, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.841003179550171 + }, + { + "auxiliary_loss_clip": 0.0154941, + "auxiliary_loss_mlp": 0.01390879, + "balance_loss_clip": 1.17602205, + "balance_loss_mlp": 1.08055389, + "epoch": 0.08236885615511799, + "flos": 31762685628000.0, + "grad_norm": 2.355931286175524, + "language_loss": 0.74276942, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77217233, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 2.8708460330963135 + }, + { + "auxiliary_loss_clip": 0.01542309, + "auxiliary_loss_mlp": 0.01389324, + "balance_loss_clip": 1.16878641, + "balance_loss_mlp": 1.07937956, + "epoch": 0.08242897940778596, + "flos": 23188105025280.0, + "grad_norm": 3.1690193001593863, + "language_loss": 0.74334466, + "learning_rate": 3.971235378388573e-06, + "loss": 0.77266097, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.8127174377441406 + }, + { + "auxiliary_loss_clip": 0.0154352, + "auxiliary_loss_mlp": 0.01380625, + "balance_loss_clip": 1.17071509, + "balance_loss_mlp": 1.06705642, + "epoch": 0.08248910266045394, + "flos": 34494109394400.0, + "grad_norm": 2.3472750256038344, + "language_loss": 0.71114719, + "learning_rate": 3.971169525711122e-06, + "loss": 0.74038863, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 2.8964579105377197 + }, + { + "auxiliary_loss_clip": 0.01554432, + "auxiliary_loss_mlp": 0.01398477, + "balance_loss_clip": 1.17888165, + "balance_loss_mlp": 1.09044039, + "epoch": 0.0825492259131219, + "flos": 13437512235360.0, + "grad_norm": 2.629843770719647, + "language_loss": 0.88232619, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.91185528, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 2.7387523651123047 + }, + { + "auxiliary_loss_clip": 0.01540146, + "auxiliary_loss_mlp": 0.01378427, + "balance_loss_clip": 1.16652799, + "balance_loss_mlp": 1.06466818, + "epoch": 0.08260934916578987, + "flos": 25814845977120.0, + "grad_norm": 2.0259328889838093, + "language_loss": 0.82467294, + "learning_rate": 3.971037596117882e-06, + "loss": 0.85385871, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 2.826366662979126 + }, + { + "auxiliary_loss_clip": 0.01639325, + "auxiliary_loss_mlp": 0.01257255, + "balance_loss_clip": 1.26435423, + "balance_loss_mlp": 1.00243378, + "epoch": 0.08266947241845783, + "flos": 63466240525920.0, + "grad_norm": 0.8533877398759111, + "language_loss": 0.60626709, + "learning_rate": 3.970971519207095e-06, + "loss": 0.63523293, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.3445894718170166 + }, + { + "auxiliary_loss_clip": 0.01637267, + "auxiliary_loss_mlp": 0.01253769, + "balance_loss_clip": 1.26192892, + "balance_loss_mlp": 1.0058136, + "epoch": 0.08272959567112581, + "flos": 70000455359520.0, + "grad_norm": 0.9351810475221869, + "language_loss": 0.62133646, + "learning_rate": 3.970905367556871e-06, + "loss": 0.65024674, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.2228739261627197 + }, + { + "auxiliary_loss_clip": 0.01552352, + "auxiliary_loss_mlp": 0.0141055, + "balance_loss_clip": 1.17803788, + "balance_loss_mlp": 1.10785365, + "epoch": 0.08278971892379378, + "flos": 20415339200160.0, + "grad_norm": 2.051843272473256, + "language_loss": 0.82514054, + "learning_rate": 3.970839141169718e-06, + "loss": 0.85476953, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 2.8662257194519043 + }, + { + "auxiliary_loss_clip": 0.01538046, + "auxiliary_loss_mlp": 0.01405568, + "balance_loss_clip": 1.16483426, + "balance_loss_mlp": 1.10535169, + "epoch": 0.08284984217646174, + "flos": 26252655099840.0, + "grad_norm": 2.156374174623076, + "language_loss": 0.84749234, + "learning_rate": 3.970772840048147e-06, + "loss": 0.87692845, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.9178264141082764 + }, + { + "auxiliary_loss_clip": 0.0153858, + "auxiliary_loss_mlp": 0.01416866, + "balance_loss_clip": 1.16340065, + "balance_loss_mlp": 1.12237132, + "epoch": 0.08290996542912972, + "flos": 27196924043520.0, + "grad_norm": 2.5525397732107424, + "language_loss": 0.87764072, + "learning_rate": 3.970706464194672e-06, + "loss": 0.90719521, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.8816685676574707 + }, + { + "auxiliary_loss_clip": 0.01546298, + "auxiliary_loss_mlp": 0.01421663, + "balance_loss_clip": 1.17148316, + "balance_loss_mlp": 1.1233542, + "epoch": 0.08297008868179769, + "flos": 38621265364800.0, + "grad_norm": 2.8822551447808205, + "language_loss": 0.78534389, + "learning_rate": 3.970640013611812e-06, + "loss": 0.81502354, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.948707103729248 + }, + { + "auxiliary_loss_clip": 0.01544036, + "auxiliary_loss_mlp": 0.01427063, + "balance_loss_clip": 1.16934466, + "balance_loss_mlp": 1.13447583, + "epoch": 0.08303021193446565, + "flos": 19976809442400.0, + "grad_norm": 3.283043613925462, + "language_loss": 0.86661434, + "learning_rate": 3.970573488302083e-06, + "loss": 0.89632535, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.778780460357666 + }, + { + "auxiliary_loss_clip": 0.01543988, + "auxiliary_loss_mlp": 0.01410462, + "balance_loss_clip": 1.16856861, + "balance_loss_mlp": 1.10490429, + "epoch": 0.08309033518713363, + "flos": 13664362746240.0, + "grad_norm": 3.593812814319803, + "language_loss": 0.87738132, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90692586, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 2.8554458618164062 + }, + { + "auxiliary_loss_clip": 0.01534098, + "auxiliary_loss_mlp": 0.0141242, + "balance_loss_clip": 1.15829754, + "balance_loss_mlp": 1.1131568, + "epoch": 0.0831504584398016, + "flos": 17970541453440.0, + "grad_norm": 3.1095706304955795, + "language_loss": 0.77211291, + "learning_rate": 3.970440213512121e-06, + "loss": 0.8015781, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 2.8070952892303467 + }, + { + "auxiliary_loss_clip": 0.01540995, + "auxiliary_loss_mlp": 0.01382332, + "balance_loss_clip": 1.16597807, + "balance_loss_mlp": 1.07238793, + "epoch": 0.08321058169246956, + "flos": 22603588322400.0, + "grad_norm": 2.697134373104882, + "language_loss": 0.83073288, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85996616, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 2.8367080688476562 + }, + { + "auxiliary_loss_clip": 0.01541048, + "auxiliary_loss_mlp": 0.01393349, + "balance_loss_clip": 1.16773772, + "balance_loss_mlp": 1.08111644, + "epoch": 0.08327070494513754, + "flos": 22852285817760.0, + "grad_norm": 6.861780545312881, + "language_loss": 0.8519811, + "learning_rate": 3.970306639845e-06, + "loss": 0.88132513, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 4.320385456085205 + }, + { + "auxiliary_loss_clip": 0.01542361, + "auxiliary_loss_mlp": 0.01376022, + "balance_loss_clip": 1.16626692, + "balance_loss_mlp": 1.0664593, + "epoch": 0.0833308281978055, + "flos": 22785152245920.0, + "grad_norm": 2.4426903589891666, + "language_loss": 0.6890437, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71822751, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 2.9456679821014404 + }, + { + "auxiliary_loss_clip": 0.01535632, + "auxiliary_loss_mlp": 0.01382706, + "balance_loss_clip": 1.16038918, + "balance_loss_mlp": 1.06856537, + "epoch": 0.08339095145047347, + "flos": 20814650876160.0, + "grad_norm": 1.7943939291641506, + "language_loss": 0.81811965, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84730303, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 2.7853240966796875 + }, + { + "auxiliary_loss_clip": 0.01541985, + "auxiliary_loss_mlp": 0.01395315, + "balance_loss_clip": 1.16682053, + "balance_loss_mlp": 1.07755041, + "epoch": 0.08345107470314143, + "flos": 18517319272800.0, + "grad_norm": 3.6225660950618126, + "language_loss": 0.77343154, + "learning_rate": 3.970105718993978e-06, + "loss": 0.80280459, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.8022849559783936 + }, + { + "auxiliary_loss_clip": 0.01535087, + "auxiliary_loss_mlp": 0.0138252, + "balance_loss_clip": 1.15985453, + "balance_loss_mlp": 1.05941534, + "epoch": 0.08351119795580941, + "flos": 18809596588320.0, + "grad_norm": 4.056594506477594, + "language_loss": 0.7910862, + "learning_rate": 3.970038595960369e-06, + "loss": 0.82026225, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 5.830451965332031 + }, + { + "auxiliary_loss_clip": 0.01548452, + "auxiliary_loss_mlp": 0.01407019, + "balance_loss_clip": 1.17258799, + "balance_loss_mlp": 1.08276939, + "epoch": 0.08357132120847738, + "flos": 18443775841920.0, + "grad_norm": 3.236654964519757, + "language_loss": 0.87590384, + "learning_rate": 3.969971398222699e-06, + "loss": 0.90545851, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 2.7739880084991455 + }, + { + "auxiliary_loss_clip": 0.01537755, + "auxiliary_loss_mlp": 0.0137593, + "balance_loss_clip": 1.1609056, + "balance_loss_mlp": 1.05644953, + "epoch": 0.08363144446114534, + "flos": 25924762877760.0, + "grad_norm": 2.2578521870070363, + "language_loss": 0.86927021, + "learning_rate": 3.969904125783517e-06, + "loss": 0.8984071, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 4.2682716846466064 + }, + { + "auxiliary_loss_clip": 0.01555718, + "auxiliary_loss_mlp": 0.01380833, + "balance_loss_clip": 1.17870593, + "balance_loss_mlp": 1.0594449, + "epoch": 0.08369156771381332, + "flos": 18043288392960.0, + "grad_norm": 4.328166619063581, + "language_loss": 0.8763141, + "learning_rate": 3.969836778645371e-06, + "loss": 0.90567958, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.852402687072754 + }, + { + "auxiliary_loss_clip": 0.01539171, + "auxiliary_loss_mlp": 0.0137446, + "balance_loss_clip": 1.16317201, + "balance_loss_mlp": 1.05631447, + "epoch": 0.08375169096648129, + "flos": 22677359322240.0, + "grad_norm": 4.179147264494828, + "language_loss": 0.79996097, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82909727, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.789896249771118 + }, + { + "auxiliary_loss_clip": 0.01548535, + "auxiliary_loss_mlp": 0.01380879, + "balance_loss_clip": 1.17382956, + "balance_loss_mlp": 1.05434132, + "epoch": 0.08381181421914925, + "flos": 26105568238080.0, + "grad_norm": 3.0793428524540816, + "language_loss": 0.84933376, + "learning_rate": 3.969701860282415e-06, + "loss": 0.8786279, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.906993865966797 + }, + { + "auxiliary_loss_clip": 0.01545553, + "auxiliary_loss_mlp": 0.013758, + "balance_loss_clip": 1.16938496, + "balance_loss_mlp": 1.05002522, + "epoch": 0.08387193747181723, + "flos": 20631797395200.0, + "grad_norm": 2.6007951452481217, + "language_loss": 0.82947987, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85869342, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 2.8167574405670166 + }, + { + "auxiliary_loss_clip": 0.01554895, + "auxiliary_loss_mlp": 0.01392635, + "balance_loss_clip": 1.17874479, + "balance_loss_mlp": 1.06685984, + "epoch": 0.0839320607244852, + "flos": 13444870298400.0, + "grad_norm": 4.4803711080282325, + "language_loss": 0.82763588, + "learning_rate": 3.969566643154293e-06, + "loss": 0.85711116, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.813591957092285 + }, + { + "auxiliary_loss_clip": 0.01555626, + "auxiliary_loss_mlp": 0.01381723, + "balance_loss_clip": 1.18002284, + "balance_loss_mlp": 1.06319594, + "epoch": 0.08399218397715316, + "flos": 23479509993120.0, + "grad_norm": 3.3111337611065323, + "language_loss": 0.76998883, + "learning_rate": 3.969498922559703e-06, + "loss": 0.7993623, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.80816388130188 + }, + { + "auxiliary_loss_clip": 0.01552256, + "auxiliary_loss_mlp": 0.01377455, + "balance_loss_clip": 1.17602754, + "balance_loss_mlp": 1.06083548, + "epoch": 0.08405230722982113, + "flos": 25923056110560.0, + "grad_norm": 2.0735201641834413, + "language_loss": 0.78312808, + "learning_rate": 3.969431127281516e-06, + "loss": 0.81242514, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 2.8516902923583984 + }, + { + "auxiliary_loss_clip": 0.01544894, + "auxiliary_loss_mlp": 0.01369222, + "balance_loss_clip": 1.17140293, + "balance_loss_mlp": 1.0466888, + "epoch": 0.0841124304824891, + "flos": 17969365680480.0, + "grad_norm": 2.93658994408384, + "language_loss": 0.95259142, + "learning_rate": 3.969363257322304e-06, + "loss": 0.98173249, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.957690477371216 + }, + { + "auxiliary_loss_clip": 0.01553811, + "auxiliary_loss_mlp": 0.0137597, + "balance_loss_clip": 1.17811286, + "balance_loss_mlp": 1.06087565, + "epoch": 0.08417255373515707, + "flos": 25632030424320.0, + "grad_norm": 3.6917021232549767, + "language_loss": 0.81935227, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84865004, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 2.8335225582122803 + }, + { + "auxiliary_loss_clip": 0.01554762, + "auxiliary_loss_mlp": 0.01374338, + "balance_loss_clip": 1.1790849, + "balance_loss_mlp": 1.06076968, + "epoch": 0.08423267698782504, + "flos": 26252124105600.0, + "grad_norm": 3.6323364222065058, + "language_loss": 0.86528802, + "learning_rate": 3.969227293371099e-06, + "loss": 0.89457905, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.8370630741119385 + }, + { + "auxiliary_loss_clip": 0.01549503, + "auxiliary_loss_mlp": 0.01380419, + "balance_loss_clip": 1.17421436, + "balance_loss_mlp": 1.06532478, + "epoch": 0.08429280024049302, + "flos": 20121734399040.0, + "grad_norm": 2.197332771483289, + "language_loss": 0.87174261, + "learning_rate": 3.969159199384263e-06, + "loss": 0.90104181, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.827022075653076 + }, + { + "auxiliary_loss_clip": 0.01544213, + "auxiliary_loss_mlp": 0.01366826, + "balance_loss_clip": 1.16989613, + "balance_loss_mlp": 1.05306709, + "epoch": 0.08435292349316098, + "flos": 42927899209920.0, + "grad_norm": 2.4515865713495937, + "language_loss": 0.891774, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.92088437, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.9808084964752197 + }, + { + "auxiliary_loss_clip": 0.0154635, + "auxiliary_loss_mlp": 0.01387838, + "balance_loss_clip": 1.17080414, + "balance_loss_mlp": 1.08132744, + "epoch": 0.08441304674582895, + "flos": 22859530096320.0, + "grad_norm": 7.538525838441971, + "language_loss": 0.79906321, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82840514, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.80257248878479 + }, + { + "auxiliary_loss_clip": 0.01554106, + "auxiliary_loss_mlp": 0.01389746, + "balance_loss_clip": 1.17928684, + "balance_loss_mlp": 1.07713187, + "epoch": 0.08447316999849692, + "flos": 18699490046880.0, + "grad_norm": 2.4464345296019925, + "language_loss": 0.83486456, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86430311, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.814396619796753 + }, + { + "auxiliary_loss_clip": 0.01558278, + "auxiliary_loss_mlp": 0.01385839, + "balance_loss_clip": 1.18313098, + "balance_loss_mlp": 1.07646716, + "epoch": 0.08453329325116489, + "flos": 25486346904480.0, + "grad_norm": 3.273197820274678, + "language_loss": 0.80086285, + "learning_rate": 3.968886076755639e-06, + "loss": 0.83030403, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 2.860323667526245 + }, + { + "auxiliary_loss_clip": 0.01554568, + "auxiliary_loss_mlp": 0.01385326, + "balance_loss_clip": 1.17938209, + "balance_loss_mlp": 1.08205748, + "epoch": 0.08459341650383286, + "flos": 20921988661920.0, + "grad_norm": 2.563717037222808, + "language_loss": 0.79748511, + "learning_rate": 3.96881760944111e-06, + "loss": 0.82688403, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.782778263092041 + }, + { + "auxiliary_loss_clip": 0.01557734, + "auxiliary_loss_mlp": 0.01378046, + "balance_loss_clip": 1.18174624, + "balance_loss_mlp": 1.07420576, + "epoch": 0.08465353975650082, + "flos": 13044800059200.0, + "grad_norm": 2.8363281445913797, + "language_loss": 0.91602248, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94538027, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 2.8131580352783203 + }, + { + "auxiliary_loss_clip": 0.01698953, + "auxiliary_loss_mlp": 0.01288315, + "balance_loss_clip": 1.32341778, + "balance_loss_mlp": 1.05943298, + "epoch": 0.0847136630091688, + "flos": 60883951741920.0, + "grad_norm": 0.8972859593753362, + "language_loss": 0.61818838, + "learning_rate": 3.968680450841368e-06, + "loss": 0.6480611, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 3.3326497077941895 + }, + { + "auxiliary_loss_clip": 0.01559367, + "auxiliary_loss_mlp": 0.0137828, + "balance_loss_clip": 1.18429077, + "balance_loss_mlp": 1.07405806, + "epoch": 0.08477378626183676, + "flos": 22048276667040.0, + "grad_norm": 5.36552298083235, + "language_loss": 0.86849999, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89787644, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 2.896707773208618 + }, + { + "auxiliary_loss_clip": 0.01558761, + "auxiliary_loss_mlp": 0.01369054, + "balance_loss_clip": 1.1835289, + "balance_loss_mlp": 1.05758405, + "epoch": 0.08483390951450473, + "flos": 16691894572320.0, + "grad_norm": 2.3145879735124177, + "language_loss": 0.74457651, + "learning_rate": 3.968542993631388e-06, + "loss": 0.77385467, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 2.7762222290039062 + }, + { + "auxiliary_loss_clip": 0.01686625, + "auxiliary_loss_mlp": 0.01261398, + "balance_loss_clip": 1.31234682, + "balance_loss_mlp": 1.01954651, + "epoch": 0.08489403276717271, + "flos": 51591269999520.0, + "grad_norm": 0.9043513486321494, + "language_loss": 0.56689596, + "learning_rate": 3.968474153054073e-06, + "loss": 0.59637618, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 3.1904611587524414 + }, + { + "auxiliary_loss_clip": 0.01551667, + "auxiliary_loss_mlp": 0.01381847, + "balance_loss_clip": 1.17695403, + "balance_loss_mlp": 1.07133031, + "epoch": 0.08495415601984067, + "flos": 17094240501120.0, + "grad_norm": 2.4297295899712155, + "language_loss": 0.89156044, + "learning_rate": 3.96840523783202e-06, + "loss": 0.92089552, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 2.8550074100494385 + }, + { + "auxiliary_loss_clip": 0.0156174, + "auxiliary_loss_mlp": 0.0138011, + "balance_loss_clip": 1.18568826, + "balance_loss_mlp": 1.07436228, + "epoch": 0.08501427927250864, + "flos": 23150631638880.0, + "grad_norm": 2.2237237983558122, + "language_loss": 0.88122827, + "learning_rate": 3.968336247967844e-06, + "loss": 0.91064674, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.770888566970825 + }, + { + "auxiliary_loss_clip": 0.01552186, + "auxiliary_loss_mlp": 0.0138097, + "balance_loss_clip": 1.17493677, + "balance_loss_mlp": 1.07407737, + "epoch": 0.08507440252517662, + "flos": 19065538362240.0, + "grad_norm": 1.811002489138618, + "language_loss": 0.7756421, + "learning_rate": 3.96826718346416e-06, + "loss": 0.8049736, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.8282039165496826 + }, + { + "auxiliary_loss_clip": 0.0156872, + "auxiliary_loss_mlp": 0.01360537, + "balance_loss_clip": 1.19259095, + "balance_loss_mlp": 1.04429877, + "epoch": 0.08513452577784458, + "flos": 60186635962560.0, + "grad_norm": 1.9462938707373196, + "language_loss": 0.71018809, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73948067, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 3.1208558082580566 + }, + { + "auxiliary_loss_clip": 0.01561086, + "auxiliary_loss_mlp": 0.01385835, + "balance_loss_clip": 1.18512952, + "balance_loss_mlp": 1.07360196, + "epoch": 0.08519464903051255, + "flos": 27310975113600.0, + "grad_norm": 2.0338334831713816, + "language_loss": 0.74917507, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77864426, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 2.8953235149383545 + }, + { + "auxiliary_loss_clip": 0.01569233, + "auxiliary_loss_mlp": 0.01393105, + "balance_loss_clip": 1.19379997, + "balance_loss_mlp": 1.08754766, + "epoch": 0.08525477228318051, + "flos": 20268517835520.0, + "grad_norm": 3.791234356267423, + "language_loss": 0.82380664, + "learning_rate": 3.968059542142265e-06, + "loss": 0.85342997, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.8294806480407715 + }, + { + "auxiliary_loss_clip": 0.01661905, + "auxiliary_loss_mlp": 0.01259277, + "balance_loss_clip": 1.28675032, + "balance_loss_mlp": 1.0166626, + "epoch": 0.08531489553584849, + "flos": 67621236130080.0, + "grad_norm": 0.8773795609447712, + "language_loss": 0.56603563, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.59524751, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 3.2317121028900146 + }, + { + "auxiliary_loss_clip": 0.01553496, + "auxiliary_loss_mlp": 0.01353416, + "balance_loss_clip": 1.17768431, + "balance_loss_mlp": 1.04232752, + "epoch": 0.08537501878851646, + "flos": 27529443501120.0, + "grad_norm": 2.231280864398755, + "language_loss": 0.70139241, + "learning_rate": 3.967920741444886e-06, + "loss": 0.7304616, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.9407806396484375 + }, + { + "auxiliary_loss_clip": 0.01558969, + "auxiliary_loss_mlp": 0.01374405, + "balance_loss_clip": 1.18264604, + "balance_loss_mlp": 1.06980157, + "epoch": 0.08543514204118442, + "flos": 22786479731520.0, + "grad_norm": 1.5603850734829048, + "language_loss": 0.8814503, + "learning_rate": 3.967851229159252e-06, + "loss": 0.91078401, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 2.799372673034668 + }, + { + "auxiliary_loss_clip": 0.01655323, + "auxiliary_loss_mlp": 0.01244621, + "balance_loss_clip": 1.28057826, + "balance_loss_mlp": 1.0111618, + "epoch": 0.0854952652938524, + "flos": 60997471817760.0, + "grad_norm": 0.8224279855487955, + "language_loss": 0.63414931, + "learning_rate": 3.967781642252502e-06, + "loss": 0.66314876, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 3.2671425342559814 + }, + { + "auxiliary_loss_clip": 0.01552141, + "auxiliary_loss_mlp": 0.01367836, + "balance_loss_clip": 1.17802215, + "balance_loss_mlp": 1.06666541, + "epoch": 0.08555538854652037, + "flos": 28040568485760.0, + "grad_norm": 2.1955514484731533, + "language_loss": 0.82951057, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85871029, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 4.402692556381226 + }, + { + "auxiliary_loss_clip": 0.01556446, + "auxiliary_loss_mlp": 0.01374406, + "balance_loss_clip": 1.18131685, + "balance_loss_mlp": 1.07743192, + "epoch": 0.08561551179918833, + "flos": 23511294155520.0, + "grad_norm": 2.19498719048477, + "language_loss": 0.7519573, + "learning_rate": 3.967642244586213e-06, + "loss": 0.7812658, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 2.879849672317505 + }, + { + "auxiliary_loss_clip": 0.01551727, + "auxiliary_loss_mlp": 0.01413582, + "balance_loss_clip": 1.17689371, + "balance_loss_mlp": 1.11603498, + "epoch": 0.08567563505185631, + "flos": 17928630472320.0, + "grad_norm": 3.018776607940042, + "language_loss": 0.76023275, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78988582, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.764342784881592 + }, + { + "auxiliary_loss_clip": 0.01559151, + "auxiliary_loss_mlp": 0.01364847, + "balance_loss_clip": 1.18567765, + "balance_loss_mlp": 1.06272292, + "epoch": 0.08573575830452428, + "flos": 19721588303520.0, + "grad_norm": 3.230593863274995, + "language_loss": 0.93731886, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.96655887, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 4.329450845718384 + }, + { + "auxiliary_loss_clip": 0.01552781, + "auxiliary_loss_mlp": 0.01408347, + "balance_loss_clip": 1.17805099, + "balance_loss_mlp": 1.10984755, + "epoch": 0.08579588155719224, + "flos": 17933523204960.0, + "grad_norm": 4.285300329096906, + "language_loss": 0.75969565, + "learning_rate": 3.967432588494471e-06, + "loss": 0.78930688, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 4.245726823806763 + }, + { + "auxiliary_loss_clip": 0.01557454, + "auxiliary_loss_mlp": 0.01403833, + "balance_loss_clip": 1.18170977, + "balance_loss_mlp": 1.11010098, + "epoch": 0.08585600480986022, + "flos": 16035123996000.0, + "grad_norm": 5.457521624982909, + "language_loss": 0.81615376, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84576666, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 2.779782772064209 + }, + { + "auxiliary_loss_clip": 0.01550916, + "auxiliary_loss_mlp": 0.0138598, + "balance_loss_clip": 1.17665553, + "balance_loss_mlp": 1.08671737, + "epoch": 0.08591612806252819, + "flos": 28659941532000.0, + "grad_norm": 2.3483765476066116, + "language_loss": 0.80167216, + "learning_rate": 3.967292444736023e-06, + "loss": 0.8310411, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 4.460594654083252 + }, + { + "auxiliary_loss_clip": 0.01543806, + "auxiliary_loss_mlp": 0.01408246, + "balance_loss_clip": 1.17075682, + "balance_loss_mlp": 1.1143235, + "epoch": 0.08597625131519615, + "flos": 20961206743680.0, + "grad_norm": 1.994416991636752, + "language_loss": 0.88402987, + "learning_rate": 3.967222260955578e-06, + "loss": 0.91355038, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 2.815978527069092 + }, + { + "auxiliary_loss_clip": 0.01553939, + "auxiliary_loss_mlp": 0.01378564, + "balance_loss_clip": 1.18029654, + "balance_loss_mlp": 1.08025479, + "epoch": 0.08603637456786412, + "flos": 23258500418880.0, + "grad_norm": 1.8823311689241888, + "language_loss": 0.81822026, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84754527, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.7837986946105957 + }, + { + "auxiliary_loss_clip": 0.01555893, + "auxiliary_loss_mlp": 0.01391091, + "balance_loss_clip": 1.18305945, + "balance_loss_mlp": 1.09144652, + "epoch": 0.0860964978205321, + "flos": 28696694283360.0, + "grad_norm": 1.6931652836512556, + "language_loss": 0.78251374, + "learning_rate": 3.967081669605559e-06, + "loss": 0.81198359, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.8352267742156982 + }, + { + "auxiliary_loss_clip": 0.01541658, + "auxiliary_loss_mlp": 0.01378886, + "balance_loss_clip": 1.16745007, + "balance_loss_mlp": 1.07466435, + "epoch": 0.08615662107320006, + "flos": 19320456075840.0, + "grad_norm": 2.565061296667001, + "language_loss": 0.73379594, + "learning_rate": 3.967011262041315e-06, + "loss": 0.76300132, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.835827350616455 + }, + { + "auxiliary_loss_clip": 0.01550695, + "auxiliary_loss_mlp": 0.01385689, + "balance_loss_clip": 1.17466104, + "balance_loss_mlp": 1.07574463, + "epoch": 0.08621674432586802, + "flos": 15853522144320.0, + "grad_norm": 2.694196471080663, + "language_loss": 0.85628396, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.88564777, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.7872536182403564 + }, + { + "auxiliary_loss_clip": 0.01544377, + "auxiliary_loss_mlp": 0.01373012, + "balance_loss_clip": 1.16971731, + "balance_loss_mlp": 1.06440282, + "epoch": 0.086276867578536, + "flos": 14102399437920.0, + "grad_norm": 3.5831459461918165, + "language_loss": 0.7876308, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81680477, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 2.766141414642334 + }, + { + "auxiliary_loss_clip": 0.01629786, + "auxiliary_loss_mlp": 0.01245583, + "balance_loss_clip": 1.25378382, + "balance_loss_mlp": 1.0235672, + "epoch": 0.08633699083120397, + "flos": 70192108173600.0, + "grad_norm": 0.8931007721322725, + "language_loss": 0.57908058, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60783422, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.4049477577209473 + }, + { + "auxiliary_loss_clip": 0.01542665, + "auxiliary_loss_mlp": 0.01373429, + "balance_loss_clip": 1.16872585, + "balance_loss_mlp": 1.05223167, + "epoch": 0.08639711408387193, + "flos": 30301450763040.0, + "grad_norm": 2.9146333774829345, + "language_loss": 0.69808072, + "learning_rate": 3.966728885918437e-06, + "loss": 0.72724169, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.8090429306030273 + }, + { + "auxiliary_loss_clip": 0.01552041, + "auxiliary_loss_mlp": 0.01384005, + "balance_loss_clip": 1.1769675, + "balance_loss_mlp": 1.05803871, + "epoch": 0.08645723733653991, + "flos": 20299467578400.0, + "grad_norm": 2.058774627160627, + "language_loss": 0.72615701, + "learning_rate": 3.966658105434627e-06, + "loss": 0.75551748, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 2.719221830368042 + }, + { + "auxiliary_loss_clip": 0.01557402, + "auxiliary_loss_mlp": 0.01376379, + "balance_loss_clip": 1.18258023, + "balance_loss_mlp": 1.05632591, + "epoch": 0.08651736058920788, + "flos": 32893373299680.0, + "grad_norm": 1.7901603974776406, + "language_loss": 0.64607501, + "learning_rate": 3.966587250374945e-06, + "loss": 0.67541277, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 2.900521755218506 + }, + { + "auxiliary_loss_clip": 0.01570112, + "auxiliary_loss_mlp": 0.01392111, + "balance_loss_clip": 1.19548333, + "balance_loss_mlp": 1.06614542, + "epoch": 0.08657748384187584, + "flos": 22639544582400.0, + "grad_norm": 2.1052814445085706, + "language_loss": 0.87855822, + "learning_rate": 3.966516320742077e-06, + "loss": 0.90818048, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 2.784341335296631 + }, + { + "auxiliary_loss_clip": 0.01547989, + "auxiliary_loss_mlp": 0.01390203, + "balance_loss_clip": 1.17166209, + "balance_loss_mlp": 1.06442833, + "epoch": 0.08663760709454381, + "flos": 23660580850560.0, + "grad_norm": 2.46436225723349, + "language_loss": 0.83402717, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.86340916, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.781604766845703 + }, + { + "auxiliary_loss_clip": 0.01629875, + "auxiliary_loss_mlp": 0.01240074, + "balance_loss_clip": 1.25210667, + "balance_loss_mlp": 1.00661469, + "epoch": 0.08669773034721179, + "flos": 62692346698560.0, + "grad_norm": 0.8589916056031145, + "language_loss": 0.60414267, + "learning_rate": 3.966374237767545e-06, + "loss": 0.63284212, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.4317243099212646 + }, + { + "auxiliary_loss_clip": 0.01551048, + "auxiliary_loss_mlp": 0.01391185, + "balance_loss_clip": 1.17371798, + "balance_loss_mlp": 1.06712604, + "epoch": 0.08675785359987975, + "flos": 20669574206880.0, + "grad_norm": 2.3011001038516743, + "language_loss": 0.79256517, + "learning_rate": 3.96630308443127e-06, + "loss": 0.82198745, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.7745330333709717 + }, + { + "auxiliary_loss_clip": 0.01539273, + "auxiliary_loss_mlp": 0.01368771, + "balance_loss_clip": 1.16289973, + "balance_loss_mlp": 1.04547572, + "epoch": 0.08681797685254772, + "flos": 26943447600000.0, + "grad_norm": 2.0174894960858905, + "language_loss": 0.82606137, + "learning_rate": 3.966231856532584e-06, + "loss": 0.85514176, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.895026922225952 + }, + { + "auxiliary_loss_clip": 0.01551076, + "auxiliary_loss_mlp": 0.01381771, + "balance_loss_clip": 1.17535353, + "balance_loss_mlp": 1.06782091, + "epoch": 0.0868781001052157, + "flos": 17714789320320.0, + "grad_norm": 3.911511587411426, + "language_loss": 0.87388098, + "learning_rate": 3.966160554074189e-06, + "loss": 0.90320951, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 2.7316489219665527 + }, + { + "auxiliary_loss_clip": 0.01557903, + "auxiliary_loss_mlp": 0.01364929, + "balance_loss_clip": 1.18199635, + "balance_loss_mlp": 1.05899048, + "epoch": 0.08693822335788366, + "flos": 19898297422560.0, + "grad_norm": 2.024054933363709, + "language_loss": 0.81735682, + "learning_rate": 3.96608917705879e-06, + "loss": 0.84658515, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 2.7909317016601562 + }, + { + "auxiliary_loss_clip": 0.01634111, + "auxiliary_loss_mlp": 0.01262184, + "balance_loss_clip": 1.25850511, + "balance_loss_mlp": 1.04322052, + "epoch": 0.08699834661055163, + "flos": 67029513076800.0, + "grad_norm": 0.7321602928823283, + "language_loss": 0.54680502, + "learning_rate": 3.966017725489091e-06, + "loss": 0.57576799, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.402387857437134 + }, + { + "auxiliary_loss_clip": 0.01563653, + "auxiliary_loss_mlp": 0.01381924, + "balance_loss_clip": 1.18901396, + "balance_loss_mlp": 1.08227921, + "epoch": 0.0870584698632196, + "flos": 13482229900320.0, + "grad_norm": 4.837721651620681, + "language_loss": 0.84532619, + "learning_rate": 3.965946199367804e-06, + "loss": 0.87478203, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 2.719696521759033 + }, + { + "auxiliary_loss_clip": 0.01564815, + "auxiliary_loss_mlp": 0.01360294, + "balance_loss_clip": 1.19132388, + "balance_loss_mlp": 1.06312943, + "epoch": 0.08711859311588757, + "flos": 16108857067680.0, + "grad_norm": 4.413686273160782, + "language_loss": 0.80723792, + "learning_rate": 3.965874598697638e-06, + "loss": 0.83648902, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 2.792349338531494 + }, + { + "auxiliary_loss_clip": 0.01557385, + "auxiliary_loss_mlp": 0.01364233, + "balance_loss_clip": 1.18212485, + "balance_loss_mlp": 1.06058311, + "epoch": 0.08717871636855554, + "flos": 38475430132320.0, + "grad_norm": 1.7257367667817576, + "language_loss": 0.70958817, + "learning_rate": 3.965802923481313e-06, + "loss": 0.7388044, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.926945447921753 + }, + { + "auxiliary_loss_clip": 0.01566656, + "auxiliary_loss_mlp": 0.01381285, + "balance_loss_clip": 1.19381213, + "balance_loss_mlp": 1.0860275, + "epoch": 0.0872388396212235, + "flos": 17602369161120.0, + "grad_norm": 2.2424607867040653, + "language_loss": 0.8349514, + "learning_rate": 3.965731173721542e-06, + "loss": 0.86443079, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.771292209625244 + }, + { + "auxiliary_loss_clip": 0.01554596, + "auxiliary_loss_mlp": 0.01374094, + "balance_loss_clip": 1.18004918, + "balance_loss_mlp": 1.08131623, + "epoch": 0.08729896287389148, + "flos": 25261430729760.0, + "grad_norm": 3.104186163550605, + "language_loss": 0.74828672, + "learning_rate": 3.965659349421049e-06, + "loss": 0.77757359, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.829723834991455 + }, + { + "auxiliary_loss_clip": 0.01565157, + "auxiliary_loss_mlp": 0.01386495, + "balance_loss_clip": 1.19135892, + "balance_loss_mlp": 1.09142852, + "epoch": 0.08735908612655945, + "flos": 15634257265440.0, + "grad_norm": 4.306487986182732, + "language_loss": 0.80646873, + "learning_rate": 3.965587450582556e-06, + "loss": 0.83598524, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 2.7961928844451904 + }, + { + "auxiliary_loss_clip": 0.01550892, + "auxiliary_loss_mlp": 0.01372024, + "balance_loss_clip": 1.17820323, + "balance_loss_mlp": 1.07409573, + "epoch": 0.08741920937922741, + "flos": 20341757841120.0, + "grad_norm": 2.348293109013769, + "language_loss": 0.71268678, + "learning_rate": 3.96551547720879e-06, + "loss": 0.74191594, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.8794288635253906 + }, + { + "auxiliary_loss_clip": 0.01634609, + "auxiliary_loss_mlp": 0.01242203, + "balance_loss_clip": 1.26264739, + "balance_loss_mlp": 1.02323914, + "epoch": 0.08747933263189539, + "flos": 62826916903200.0, + "grad_norm": 0.783393502169501, + "language_loss": 0.58478373, + "learning_rate": 3.96544342930248e-06, + "loss": 0.61355186, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 3.4083895683288574 + }, + { + "auxiliary_loss_clip": 0.01557655, + "auxiliary_loss_mlp": 0.01370025, + "balance_loss_clip": 1.18407989, + "balance_loss_mlp": 1.07934499, + "epoch": 0.08753945588456336, + "flos": 33038639609760.0, + "grad_norm": 1.697288078697979, + "language_loss": 0.77601576, + "learning_rate": 3.965371306866359e-06, + "loss": 0.80529249, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 2.9303715229034424 + }, + { + "auxiliary_loss_clip": 0.01560483, + "auxiliary_loss_mlp": 0.01380272, + "balance_loss_clip": 1.18673098, + "balance_loss_mlp": 1.08272624, + "epoch": 0.08759957913723132, + "flos": 35549736508800.0, + "grad_norm": 2.083815268440607, + "language_loss": 0.71952903, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74893659, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 2.901547908782959 + }, + { + "auxiliary_loss_clip": 0.01561172, + "auxiliary_loss_mlp": 0.01368534, + "balance_loss_clip": 1.18754375, + "balance_loss_mlp": 1.07518411, + "epoch": 0.0876597023898993, + "flos": 23913184946400.0, + "grad_norm": 1.6879668427178833, + "language_loss": 0.86411613, + "learning_rate": 3.965226838415622e-06, + "loss": 0.89341325, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 2.820154905319214 + }, + { + "auxiliary_loss_clip": 0.01558033, + "auxiliary_loss_mlp": 0.0138499, + "balance_loss_clip": 1.18419254, + "balance_loss_mlp": 1.08477283, + "epoch": 0.08771982564256726, + "flos": 18115997404320.0, + "grad_norm": 1.9225964492086065, + "language_loss": 0.80568612, + "learning_rate": 3.965154492406486e-06, + "loss": 0.83511633, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 2.7662179470062256 + }, + { + "auxiliary_loss_clip": 0.01556695, + "auxiliary_loss_mlp": 0.01368862, + "balance_loss_clip": 1.18431282, + "balance_loss_mlp": 1.06654704, + "epoch": 0.08777994889523523, + "flos": 17713916972640.0, + "grad_norm": 2.884801804892103, + "language_loss": 0.843732, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.87298763, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 2.7481818199157715 + }, + { + "auxiliary_loss_clip": 0.01552794, + "auxiliary_loss_mlp": 0.01369153, + "balance_loss_clip": 1.180861, + "balance_loss_mlp": 1.06397748, + "epoch": 0.0878400721479032, + "flos": 12821401010880.0, + "grad_norm": 3.942633658280356, + "language_loss": 0.80804521, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83726466, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 4.200263738632202 + }, + { + "auxiliary_loss_clip": 0.01556451, + "auxiliary_loss_mlp": 0.01372061, + "balance_loss_clip": 1.1840024, + "balance_loss_mlp": 1.06974626, + "epoch": 0.08790019540057117, + "flos": 26394925085280.0, + "grad_norm": 1.7591331762576623, + "language_loss": 0.76414871, + "learning_rate": 3.964937007276932e-06, + "loss": 0.79343379, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 2.8544552326202393 + }, + { + "auxiliary_loss_clip": 0.01565409, + "auxiliary_loss_mlp": 0.01384026, + "balance_loss_clip": 1.19207084, + "balance_loss_mlp": 1.07503593, + "epoch": 0.08796031865323914, + "flos": 19135971684000.0, + "grad_norm": 6.188917995218897, + "language_loss": 0.74743843, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.77693284, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 2.75203800201416 + }, + { + "auxiliary_loss_clip": 0.01561237, + "auxiliary_loss_mlp": 0.01379049, + "balance_loss_clip": 1.18734384, + "balance_loss_mlp": 1.07692456, + "epoch": 0.0880204419059071, + "flos": 26066501868960.0, + "grad_norm": 7.363189469663584, + "language_loss": 0.83568436, + "learning_rate": 3.964791644632941e-06, + "loss": 0.86508721, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.8345136642456055 + }, + { + "auxiliary_loss_clip": 0.01556159, + "auxiliary_loss_mlp": 0.01377468, + "balance_loss_clip": 1.18208218, + "balance_loss_mlp": 1.07877696, + "epoch": 0.08808056515857508, + "flos": 22379772064320.0, + "grad_norm": 3.0620087892765184, + "language_loss": 0.7834214, + "learning_rate": 3.964718851551923e-06, + "loss": 0.81275761, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 5.700449466705322 + }, + { + "auxiliary_loss_clip": 0.0155795, + "auxiliary_loss_mlp": 0.0136342, + "balance_loss_clip": 1.18560386, + "balance_loss_mlp": 1.05481076, + "epoch": 0.08814068841124305, + "flos": 23187536102880.0, + "grad_norm": 2.2397768598477485, + "language_loss": 0.85037321, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87958694, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 2.750126838684082 + }, + { + "auxiliary_loss_clip": 0.01557, + "auxiliary_loss_mlp": 0.01364907, + "balance_loss_clip": 1.18314528, + "balance_loss_mlp": 1.05877721, + "epoch": 0.08820081166391101, + "flos": 25157620262880.0, + "grad_norm": 2.993964338903606, + "language_loss": 0.83945072, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86866987, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.784679651260376 + }, + { + "auxiliary_loss_clip": 0.01557014, + "auxiliary_loss_mlp": 0.01354879, + "balance_loss_clip": 1.18076825, + "balance_loss_mlp": 1.04398143, + "epoch": 0.08826093491657899, + "flos": 22233595478400.0, + "grad_norm": 2.5978320656280767, + "language_loss": 0.754884, + "learning_rate": 3.964500025305907e-06, + "loss": 0.7840029, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 4.3299877643585205 + }, + { + "auxiliary_loss_clip": 0.01559795, + "auxiliary_loss_mlp": 0.01370685, + "balance_loss_clip": 1.18590164, + "balance_loss_mlp": 1.07104075, + "epoch": 0.08832105816924696, + "flos": 22128988520160.0, + "grad_norm": 1.968396128201916, + "language_loss": 0.80842817, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.83773291, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 2.7659013271331787 + }, + { + "auxiliary_loss_clip": 0.01564066, + "auxiliary_loss_mlp": 0.01364755, + "balance_loss_clip": 1.18938494, + "balance_loss_mlp": 1.05767179, + "epoch": 0.08838118142191492, + "flos": 17568423093600.0, + "grad_norm": 3.3805683309260854, + "language_loss": 0.7781992, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.80748743, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.743354082107544 + }, + { + "auxiliary_loss_clip": 0.01556984, + "auxiliary_loss_mlp": 0.01351448, + "balance_loss_clip": 1.18243396, + "balance_loss_mlp": 1.05409229, + "epoch": 0.0884413046745829, + "flos": 20779301466720.0, + "grad_norm": 1.9046622572499659, + "language_loss": 0.84672672, + "learning_rate": 3.964280528613569e-06, + "loss": 0.8758111, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.7373008728027344 + }, + { + "auxiliary_loss_clip": 0.0156483, + "auxiliary_loss_mlp": 0.01375466, + "balance_loss_clip": 1.18998253, + "balance_loss_mlp": 1.07257867, + "epoch": 0.08850142792725087, + "flos": 22127547250080.0, + "grad_norm": 2.2607007238170373, + "language_loss": 0.8364979, + "learning_rate": 3.964207214074324e-06, + "loss": 0.86590087, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.8177831172943115 + }, + { + "auxiliary_loss_clip": 0.01560063, + "auxiliary_loss_mlp": 0.013602, + "balance_loss_clip": 1.18613112, + "balance_loss_mlp": 1.05883908, + "epoch": 0.08856155117991883, + "flos": 22420924482240.0, + "grad_norm": 2.4541884322175904, + "language_loss": 0.82889748, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85810012, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.793095588684082 + }, + { + "auxiliary_loss_clip": 0.01555405, + "auxiliary_loss_mlp": 0.01368205, + "balance_loss_clip": 1.18089008, + "balance_loss_mlp": 1.06360102, + "epoch": 0.0886216744325868, + "flos": 29939726257920.0, + "grad_norm": 1.708522627577366, + "language_loss": 0.78810227, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81733829, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.916419506072998 + }, + { + "auxiliary_loss_clip": 0.015665, + "auxiliary_loss_mlp": 0.01374504, + "balance_loss_clip": 1.1933943, + "balance_loss_mlp": 1.07409632, + "epoch": 0.08868179768525478, + "flos": 23984831969280.0, + "grad_norm": 2.1780204853632767, + "language_loss": 0.79389989, + "learning_rate": 3.963986823570121e-06, + "loss": 0.8233099, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 2.872620105743408 + }, + { + "auxiliary_loss_clip": 0.01560286, + "auxiliary_loss_mlp": 0.01354449, + "balance_loss_clip": 1.18552732, + "balance_loss_mlp": 1.05728436, + "epoch": 0.08874192093792274, + "flos": 43180806731040.0, + "grad_norm": 2.025271155459339, + "language_loss": 0.74144983, + "learning_rate": 3.963913211115848e-06, + "loss": 0.7705971, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.973794460296631 + }, + { + "auxiliary_loss_clip": 0.01557487, + "auxiliary_loss_mlp": 0.01365278, + "balance_loss_clip": 1.18315589, + "balance_loss_mlp": 1.06811297, + "epoch": 0.0888020441905907, + "flos": 32855141350080.0, + "grad_norm": 1.636818211944316, + "language_loss": 0.74742079, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.77664834, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.881152629852295 + }, + { + "auxiliary_loss_clip": 0.01554738, + "auxiliary_loss_mlp": 0.01370514, + "balance_loss_clip": 1.18021584, + "balance_loss_mlp": 1.07296789, + "epoch": 0.08886216744325869, + "flos": 23151617771040.0, + "grad_norm": 2.1413027551963486, + "language_loss": 0.87209404, + "learning_rate": 3.963765762794739e-06, + "loss": 0.90134656, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.7622578144073486 + }, + { + "auxiliary_loss_clip": 0.01570269, + "auxiliary_loss_mlp": 0.0136225, + "balance_loss_clip": 1.19736505, + "balance_loss_mlp": 1.06622982, + "epoch": 0.08892229069592665, + "flos": 23333864401440.0, + "grad_norm": 2.0210838973447385, + "language_loss": 0.77537149, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80469668, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 2.8845791816711426 + }, + { + "auxiliary_loss_clip": 0.01561774, + "auxiliary_loss_mlp": 0.01356278, + "balance_loss_clip": 1.18640828, + "balance_loss_mlp": 1.0570147, + "epoch": 0.08898241394859462, + "flos": 26216205773760.0, + "grad_norm": 2.9235601969019167, + "language_loss": 0.77881169, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80799222, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 2.847515344619751 + }, + { + "auxiliary_loss_clip": 0.01561779, + "auxiliary_loss_mlp": 0.0136889, + "balance_loss_clip": 1.18751526, + "balance_loss_mlp": 1.06352353, + "epoch": 0.0890425372012626, + "flos": 23552901711360.0, + "grad_norm": 1.9793414751730012, + "language_loss": 0.66862392, + "learning_rate": 3.963544031823624e-06, + "loss": 0.69793057, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.8035173416137695 + }, + { + "auxiliary_loss_clip": 0.01557064, + "auxiliary_loss_mlp": 0.01384581, + "balance_loss_clip": 1.18244708, + "balance_loss_mlp": 1.08207524, + "epoch": 0.08910266045393056, + "flos": 23005023975360.0, + "grad_norm": 2.3898038614400385, + "language_loss": 0.96538186, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.99479836, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.793639659881592 + }, + { + "auxiliary_loss_clip": 0.0155084, + "auxiliary_loss_mlp": 0.0135616, + "balance_loss_clip": 1.17773438, + "balance_loss_mlp": 1.05479932, + "epoch": 0.08916278370659853, + "flos": 31938939609120.0, + "grad_norm": 1.99016948551738, + "language_loss": 0.78572798, + "learning_rate": 3.96339583888261e-06, + "loss": 0.81479794, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.8427178859710693 + }, + { + "auxiliary_loss_clip": 0.01561373, + "auxiliary_loss_mlp": 0.01370182, + "balance_loss_clip": 1.18867719, + "balance_loss_mlp": 1.0671041, + "epoch": 0.08922290695926649, + "flos": 17532353049120.0, + "grad_norm": 2.590333960183644, + "language_loss": 0.8540405, + "learning_rate": 3.963321630732448e-06, + "loss": 0.88335603, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 2.8093860149383545 + }, + { + "auxiliary_loss_clip": 0.01560229, + "auxiliary_loss_mlp": 0.01371983, + "balance_loss_clip": 1.18654037, + "balance_loss_mlp": 1.06718886, + "epoch": 0.08928303021193447, + "flos": 32127861595680.0, + "grad_norm": 2.506647380896557, + "language_loss": 0.80621493, + "learning_rate": 3.963247348132932e-06, + "loss": 0.83553708, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.853532075881958 + }, + { + "auxiliary_loss_clip": 0.01558446, + "auxiliary_loss_mlp": 0.0136122, + "balance_loss_clip": 1.18522334, + "balance_loss_mlp": 1.06252909, + "epoch": 0.08934315346460243, + "flos": 22127281752960.0, + "grad_norm": 2.998139496461356, + "language_loss": 0.83310008, + "learning_rate": 3.96317299108688e-06, + "loss": 0.86229682, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 2.8853728771209717 + }, + { + "auxiliary_loss_clip": 0.01556476, + "auxiliary_loss_mlp": 0.01351124, + "balance_loss_clip": 1.18329287, + "balance_loss_mlp": 1.04995406, + "epoch": 0.0894032767172704, + "flos": 22567821703200.0, + "grad_norm": 2.167623794983781, + "language_loss": 0.76815474, + "learning_rate": 3.963098559597111e-06, + "loss": 0.79723072, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 2.7869632244110107 + }, + { + "auxiliary_loss_clip": 0.01553332, + "auxiliary_loss_mlp": 0.01369286, + "balance_loss_clip": 1.17966831, + "balance_loss_mlp": 1.06658936, + "epoch": 0.08946339996993838, + "flos": 20195467470720.0, + "grad_norm": 2.419361802164064, + "language_loss": 0.82890999, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85813618, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.7718138694763184 + }, + { + "auxiliary_loss_clip": 0.01556843, + "auxiliary_loss_mlp": 0.01354945, + "balance_loss_clip": 1.18280828, + "balance_loss_mlp": 1.05186749, + "epoch": 0.08952352322260634, + "flos": 48363817384800.0, + "grad_norm": 7.555951399249367, + "language_loss": 0.71936285, + "learning_rate": 3.962949473297718e-06, + "loss": 0.7484808, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 3.0428295135498047 + }, + { + "auxiliary_loss_clip": 0.0155534, + "auxiliary_loss_mlp": 0.01358469, + "balance_loss_clip": 1.18213165, + "balance_loss_mlp": 1.05462813, + "epoch": 0.08958364647527431, + "flos": 31795645563360.0, + "grad_norm": 1.9862635117668705, + "language_loss": 0.89590907, + "learning_rate": 3.962874818493745e-06, + "loss": 0.92504716, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.7901742458343506 + }, + { + "auxiliary_loss_clip": 0.0155348, + "auxiliary_loss_mlp": 0.01391785, + "balance_loss_clip": 1.17976665, + "balance_loss_mlp": 1.09519196, + "epoch": 0.08964376972794229, + "flos": 23370465440160.0, + "grad_norm": 2.799735447303641, + "language_loss": 0.74218345, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.77163613, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.865168333053589 + }, + { + "auxiliary_loss_clip": 0.01555005, + "auxiliary_loss_mlp": 0.01361848, + "balance_loss_clip": 1.18211102, + "balance_loss_mlp": 1.06659091, + "epoch": 0.08970389298061025, + "flos": 23297035793760.0, + "grad_norm": 1.821439626923843, + "language_loss": 0.77001166, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79918021, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 2.728147506713867 + }, + { + "auxiliary_loss_clip": 0.01555505, + "auxiliary_loss_mlp": 0.01352801, + "balance_loss_clip": 1.18287218, + "balance_loss_mlp": 1.05716181, + "epoch": 0.08976401623327822, + "flos": 33764288453280.0, + "grad_norm": 2.3685528093000814, + "language_loss": 0.70863324, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73771632, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.8573601245880127 + }, + { + "auxiliary_loss_clip": 0.0155646, + "auxiliary_loss_mlp": 0.01363537, + "balance_loss_clip": 1.18317389, + "balance_loss_mlp": 1.06522751, + "epoch": 0.08982413948594618, + "flos": 23913412515360.0, + "grad_norm": 1.932583136484598, + "language_loss": 0.87416232, + "learning_rate": 3.962575454982109e-06, + "loss": 0.90336227, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 2.796006679534912 + }, + { + "auxiliary_loss_clip": 0.01553455, + "auxiliary_loss_mlp": 0.01369626, + "balance_loss_clip": 1.17854488, + "balance_loss_mlp": 1.07913637, + "epoch": 0.08988426273861416, + "flos": 16839474500160.0, + "grad_norm": 2.519129150860271, + "language_loss": 0.8318091, + "learning_rate": 3.962500428044454e-06, + "loss": 0.86103988, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.7624287605285645 + }, + { + "auxiliary_loss_clip": 0.01552469, + "auxiliary_loss_mlp": 0.01361961, + "balance_loss_clip": 1.17921126, + "balance_loss_mlp": 1.0628891, + "epoch": 0.08994438599128213, + "flos": 14795126274240.0, + "grad_norm": 2.107525132053918, + "language_loss": 0.69988757, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72903186, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 2.8112339973449707 + }, + { + "auxiliary_loss_clip": 0.01549574, + "auxiliary_loss_mlp": 0.01358031, + "balance_loss_clip": 1.17568421, + "balance_loss_mlp": 1.06429982, + "epoch": 0.09000450924395009, + "flos": 17386252319520.0, + "grad_norm": 1.7380755871114568, + "language_loss": 0.79888904, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82796514, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 2.8131401538848877 + }, + { + "auxiliary_loss_clip": 0.01554944, + "auxiliary_loss_mlp": 0.01376829, + "balance_loss_clip": 1.18160188, + "balance_loss_mlp": 1.08443189, + "epoch": 0.09006463249661807, + "flos": 24282646796160.0, + "grad_norm": 3.236467478939079, + "language_loss": 0.82808447, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.85740221, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 2.835418939590454 + }, + { + "auxiliary_loss_clip": 0.01550948, + "auxiliary_loss_mlp": 0.01361596, + "balance_loss_clip": 1.17690945, + "balance_loss_mlp": 1.06824553, + "epoch": 0.09012475574928604, + "flos": 13663604183040.0, + "grad_norm": 8.640395819482245, + "language_loss": 0.79303205, + "learning_rate": 3.962199576140195e-06, + "loss": 0.8221575, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 4.297576665878296 + }, + { + "auxiliary_loss_clip": 0.01557279, + "auxiliary_loss_mlp": 0.01355107, + "balance_loss_clip": 1.18313527, + "balance_loss_mlp": 1.05927753, + "epoch": 0.090184879001954, + "flos": 23329806088320.0, + "grad_norm": 2.1515135058542, + "language_loss": 0.93029052, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95941436, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.9322853088378906 + }, + { + "auxiliary_loss_clip": 0.01556017, + "auxiliary_loss_mlp": 0.0137631, + "balance_loss_clip": 1.18170357, + "balance_loss_mlp": 1.08200657, + "epoch": 0.09024500225462198, + "flos": 23004910190880.0, + "grad_norm": 2.551868268192498, + "language_loss": 0.74429405, + "learning_rate": 3.962048703735822e-06, + "loss": 0.77361733, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 2.8171393871307373 + }, + { + "auxiliary_loss_clip": 0.01643524, + "auxiliary_loss_mlp": 0.01240913, + "balance_loss_clip": 1.27040374, + "balance_loss_mlp": 1.00440216, + "epoch": 0.09030512550728995, + "flos": 62195710271040.0, + "grad_norm": 1.62257604426799, + "language_loss": 0.58259153, + "learning_rate": 3.96197315593058e-06, + "loss": 0.61143595, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 3.3720510005950928 + }, + { + "auxiliary_loss_clip": 0.01554399, + "auxiliary_loss_mlp": 0.01353905, + "balance_loss_clip": 1.18109202, + "balance_loss_mlp": 1.05845642, + "epoch": 0.09036524875995791, + "flos": 38803777492320.0, + "grad_norm": 2.4838063600880296, + "language_loss": 0.69580722, + "learning_rate": 3.961897533727119e-06, + "loss": 0.72489023, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 6.001763582229614 + }, + { + "auxiliary_loss_clip": 0.01551141, + "auxiliary_loss_mlp": 0.01365418, + "balance_loss_clip": 1.17860031, + "balance_loss_mlp": 1.07264006, + "epoch": 0.09042537201262588, + "flos": 21692279314080.0, + "grad_norm": 2.6977870733872216, + "language_loss": 0.86369985, + "learning_rate": 3.961821837128306e-06, + "loss": 0.89286542, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.8028831481933594 + }, + { + "auxiliary_loss_clip": 0.01563661, + "auxiliary_loss_mlp": 0.01372188, + "balance_loss_clip": 1.19289684, + "balance_loss_mlp": 1.07139921, + "epoch": 0.09048549526529386, + "flos": 22268869031520.0, + "grad_norm": 2.1103152040629896, + "language_loss": 0.72620279, + "learning_rate": 3.961746066137014e-06, + "loss": 0.75556129, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 2.8140060901641846 + }, + { + "auxiliary_loss_clip": 0.01570195, + "auxiliary_loss_mlp": 0.01373384, + "balance_loss_clip": 1.19896877, + "balance_loss_mlp": 1.07621908, + "epoch": 0.09054561851796182, + "flos": 14612690003040.0, + "grad_norm": 2.939125772381591, + "language_loss": 0.81024766, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83968347, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 4.313610076904297 + }, + { + "auxiliary_loss_clip": 0.01561262, + "auxiliary_loss_mlp": 0.01371058, + "balance_loss_clip": 1.18913817, + "balance_loss_mlp": 1.07084084, + "epoch": 0.09060574177062979, + "flos": 27638639766720.0, + "grad_norm": 1.8793847322800437, + "language_loss": 0.76631927, + "learning_rate": 3.961594300988482e-06, + "loss": 0.7956425, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 2.811584234237671 + }, + { + "auxiliary_loss_clip": 0.01653261, + "auxiliary_loss_mlp": 0.01372093, + "balance_loss_clip": 1.28057909, + "balance_loss_mlp": 1.1264267, + "epoch": 0.09066586502329776, + "flos": 66092109130080.0, + "grad_norm": 0.7849219478671567, + "language_loss": 0.57637393, + "learning_rate": 3.961518306836998e-06, + "loss": 0.60662758, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 3.233978509902954 + }, + { + "auxiliary_loss_clip": 0.01565785, + "auxiliary_loss_mlp": 0.01364466, + "balance_loss_clip": 1.194664, + "balance_loss_mlp": 1.07245147, + "epoch": 0.09072598827596573, + "flos": 18918451500480.0, + "grad_norm": 2.0077259427970597, + "language_loss": 0.85575461, + "learning_rate": 3.961442238304543e-06, + "loss": 0.88505715, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.8061070442199707 + }, + { + "auxiliary_loss_clip": 0.01563879, + "auxiliary_loss_mlp": 0.01377459, + "balance_loss_clip": 1.19239759, + "balance_loss_mlp": 1.0896405, + "epoch": 0.0907861115286337, + "flos": 24823773319680.0, + "grad_norm": 3.0704837982352577, + "language_loss": 0.8415091, + "learning_rate": 3.961366095394002e-06, + "loss": 0.87092251, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.808809757232666 + }, + { + "auxiliary_loss_clip": 0.01557781, + "auxiliary_loss_mlp": 0.01414523, + "balance_loss_clip": 1.1854701, + "balance_loss_mlp": 1.13433313, + "epoch": 0.09084623478130167, + "flos": 21654995568480.0, + "grad_norm": 1.9946226331615247, + "language_loss": 0.85172224, + "learning_rate": 3.961289878108262e-06, + "loss": 0.88144529, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.823103427886963 + }, + { + "auxiliary_loss_clip": 0.01562699, + "auxiliary_loss_mlp": 0.01380231, + "balance_loss_clip": 1.19070971, + "balance_loss_mlp": 1.0988971, + "epoch": 0.09090635803396964, + "flos": 27641901588480.0, + "grad_norm": 1.9155499781798004, + "language_loss": 0.85064805, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.8800773, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.8007652759552 + }, + { + "auxiliary_loss_clip": 0.0155167, + "auxiliary_loss_mlp": 0.01391253, + "balance_loss_clip": 1.17889678, + "balance_loss_mlp": 1.11335266, + "epoch": 0.0909664812866376, + "flos": 17670678505920.0, + "grad_norm": 3.356025967420437, + "language_loss": 0.86570799, + "learning_rate": 3.961137220422749e-06, + "loss": 0.89513719, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.773301124572754 + }, + { + "auxiliary_loss_clip": 0.01558255, + "auxiliary_loss_mlp": 0.01381724, + "balance_loss_clip": 1.18668246, + "balance_loss_mlp": 1.10058069, + "epoch": 0.09102660453930557, + "flos": 23953730513760.0, + "grad_norm": 1.8634526517265886, + "language_loss": 0.8673138, + "learning_rate": 3.961060780028764e-06, + "loss": 0.89671361, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.837021827697754 + }, + { + "auxiliary_loss_clip": 0.01551158, + "auxiliary_loss_mlp": 0.0138731, + "balance_loss_clip": 1.17702103, + "balance_loss_mlp": 1.10349643, + "epoch": 0.09108672779197355, + "flos": 25815376971360.0, + "grad_norm": 1.9490884102372752, + "language_loss": 0.90151966, + "learning_rate": 3.960984265271159e-06, + "loss": 0.93090433, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 2.887789726257324 + }, + { + "auxiliary_loss_clip": 0.01550827, + "auxiliary_loss_mlp": 0.01392677, + "balance_loss_clip": 1.17792273, + "balance_loss_mlp": 1.11325049, + "epoch": 0.09114685104464151, + "flos": 29641873502880.0, + "grad_norm": 2.5679281743652993, + "language_loss": 0.85199571, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.88143075, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.841268539428711 + }, + { + "auxiliary_loss_clip": 0.01551055, + "auxiliary_loss_mlp": 0.01384952, + "balance_loss_clip": 1.18031263, + "balance_loss_mlp": 1.10609782, + "epoch": 0.09120697429730948, + "flos": 33732466362720.0, + "grad_norm": 1.5269188573070092, + "language_loss": 0.81140769, + "learning_rate": 3.960831012676692e-06, + "loss": 0.84076774, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.9316458702087402 + }, + { + "auxiliary_loss_clip": 0.01555957, + "auxiliary_loss_mlp": 0.01402728, + "balance_loss_clip": 1.18410563, + "balance_loss_mlp": 1.1257807, + "epoch": 0.09126709754997746, + "flos": 18403116490080.0, + "grad_norm": 1.9864146070449418, + "language_loss": 0.77866888, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80825579, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.792651653289795 + }, + { + "auxiliary_loss_clip": 0.01550435, + "auxiliary_loss_mlp": 0.01380908, + "balance_loss_clip": 1.17914867, + "balance_loss_mlp": 1.09881115, + "epoch": 0.09132722080264542, + "flos": 22094208033120.0, + "grad_norm": 2.818856379181375, + "language_loss": 0.86715961, + "learning_rate": 3.960677462662594e-06, + "loss": 0.89647299, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 2.748260974884033 + }, + { + "auxiliary_loss_clip": 0.01554692, + "auxiliary_loss_mlp": 0.01395131, + "balance_loss_clip": 1.18306601, + "balance_loss_mlp": 1.1116991, + "epoch": 0.09138734405531339, + "flos": 21035281168800.0, + "grad_norm": 3.003468830792883, + "language_loss": 0.7345469, + "learning_rate": 3.96060057613046e-06, + "loss": 0.76404512, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 2.850630044937134 + }, + { + "auxiliary_loss_clip": 0.01559971, + "auxiliary_loss_mlp": 0.01408796, + "balance_loss_clip": 1.19075096, + "balance_loss_mlp": 1.11983263, + "epoch": 0.09144746730798137, + "flos": 20086005708000.0, + "grad_norm": 3.7390475482492, + "language_loss": 0.86009747, + "learning_rate": 3.960523615252156e-06, + "loss": 0.88978517, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 2.7295420169830322 + }, + { + "auxiliary_loss_clip": 0.01564357, + "auxiliary_loss_mlp": 0.01398716, + "balance_loss_clip": 1.19312704, + "balance_loss_mlp": 1.11337686, + "epoch": 0.09150759056064933, + "flos": 22780107800640.0, + "grad_norm": 1.8058911964995008, + "language_loss": 0.84093928, + "learning_rate": 3.960446580030599e-06, + "loss": 0.87057006, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.890258550643921 + }, + { + "auxiliary_loss_clip": 0.0156328, + "auxiliary_loss_mlp": 0.01381012, + "balance_loss_clip": 1.19304967, + "balance_loss_mlp": 1.09395587, + "epoch": 0.0915677138133173, + "flos": 27566727246720.0, + "grad_norm": 2.344161228891179, + "language_loss": 0.81413364, + "learning_rate": 3.960369470468711e-06, + "loss": 0.84357661, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 2.813875436782837 + }, + { + "auxiliary_loss_clip": 0.01560413, + "auxiliary_loss_mlp": 0.01378032, + "balance_loss_clip": 1.19105124, + "balance_loss_mlp": 1.08086681, + "epoch": 0.09162783706598528, + "flos": 17676633227040.0, + "grad_norm": 2.1670788857965135, + "language_loss": 0.74754828, + "learning_rate": 3.960292286569418e-06, + "loss": 0.77693272, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.8349194526672363 + }, + { + "auxiliary_loss_clip": 0.01559117, + "auxiliary_loss_mlp": 0.01368939, + "balance_loss_clip": 1.18881905, + "balance_loss_mlp": 1.07444477, + "epoch": 0.09168796031865324, + "flos": 18480149311680.0, + "grad_norm": 2.5258997155462937, + "language_loss": 0.86432672, + "learning_rate": 3.960215028335644e-06, + "loss": 0.89360726, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 3.0656964778900146 + }, + { + "auxiliary_loss_clip": 0.01566236, + "auxiliary_loss_mlp": 0.01363642, + "balance_loss_clip": 1.19764805, + "balance_loss_mlp": 1.0723896, + "epoch": 0.0917480835713212, + "flos": 29390179682880.0, + "grad_norm": 2.7361237905668836, + "language_loss": 0.75326562, + "learning_rate": 3.96013769577032e-06, + "loss": 0.7825644, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 2.8513572216033936 + }, + { + "auxiliary_loss_clip": 0.01569162, + "auxiliary_loss_mlp": 0.01381913, + "balance_loss_clip": 1.20028782, + "balance_loss_mlp": 1.09447575, + "epoch": 0.09180820682398917, + "flos": 19831353491520.0, + "grad_norm": 5.380877991910275, + "language_loss": 0.77656031, + "learning_rate": 3.960060288876378e-06, + "loss": 0.8060711, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.876950979232788 + }, + { + "auxiliary_loss_clip": 0.01569389, + "auxiliary_loss_mlp": 0.01385659, + "balance_loss_clip": 1.20123231, + "balance_loss_mlp": 1.09269059, + "epoch": 0.09186833007665715, + "flos": 23844117038400.0, + "grad_norm": 2.1180003172362842, + "language_loss": 0.7827552, + "learning_rate": 3.959982807656753e-06, + "loss": 0.81230563, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 2.8204615116119385 + }, + { + "auxiliary_loss_clip": 0.01572822, + "auxiliary_loss_mlp": 0.01375317, + "balance_loss_clip": 1.20703828, + "balance_loss_mlp": 1.08539963, + "epoch": 0.09192845332932512, + "flos": 12934390092480.0, + "grad_norm": 2.782627302084331, + "language_loss": 0.77105653, + "learning_rate": 3.959905252114384e-06, + "loss": 0.80053788, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 2.764019727706909 + }, + { + "auxiliary_loss_clip": 0.01571146, + "auxiliary_loss_mlp": 0.01372334, + "balance_loss_clip": 1.20436251, + "balance_loss_mlp": 1.07974696, + "epoch": 0.09198857658199308, + "flos": 24570296876160.0, + "grad_norm": 2.034232076769783, + "language_loss": 0.83332813, + "learning_rate": 3.959827622252211e-06, + "loss": 0.86276293, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.8048975467681885 + }, + { + "auxiliary_loss_clip": 0.01579056, + "auxiliary_loss_mlp": 0.01364922, + "balance_loss_clip": 1.21187401, + "balance_loss_mlp": 1.07462335, + "epoch": 0.09204869983466106, + "flos": 20269048829760.0, + "grad_norm": 2.6640709478186526, + "language_loss": 0.84036279, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86980259, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 2.7666845321655273 + }, + { + "auxiliary_loss_clip": 0.01573395, + "auxiliary_loss_mlp": 0.0137191, + "balance_loss_clip": 1.20647454, + "balance_loss_mlp": 1.08485389, + "epoch": 0.09210882308732903, + "flos": 20887701240960.0, + "grad_norm": 2.096452020482017, + "language_loss": 0.81574535, + "learning_rate": 3.959672139580233e-06, + "loss": 0.84519839, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 2.76557993888855 + }, + { + "auxiliary_loss_clip": 0.01575064, + "auxiliary_loss_mlp": 0.01375537, + "balance_loss_clip": 1.20884609, + "balance_loss_mlp": 1.08199644, + "epoch": 0.09216894633999699, + "flos": 30958979902560.0, + "grad_norm": 2.1588173265133443, + "language_loss": 0.8383112, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.86781728, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.8876564502716064 + }, + { + "auxiliary_loss_clip": 0.01575173, + "auxiliary_loss_mlp": 0.0137039, + "balance_loss_clip": 1.20957375, + "balance_loss_mlp": 1.08886528, + "epoch": 0.09222906959266497, + "flos": 13153237761600.0, + "grad_norm": 2.308256100944013, + "language_loss": 0.90301913, + "learning_rate": 3.959516359664402e-06, + "loss": 0.93247473, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 2.9185705184936523 + }, + { + "auxiliary_loss_clip": 0.01574105, + "auxiliary_loss_mlp": 0.01369163, + "balance_loss_clip": 1.20782304, + "balance_loss_mlp": 1.07981849, + "epoch": 0.09228919284533293, + "flos": 25996865038560.0, + "grad_norm": 2.945046605650394, + "language_loss": 0.76005352, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78948617, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 2.835789203643799 + }, + { + "auxiliary_loss_clip": 0.01577137, + "auxiliary_loss_mlp": 0.01372395, + "balance_loss_clip": 1.20958042, + "balance_loss_mlp": 1.08629274, + "epoch": 0.0923493160980009, + "flos": 18662775223680.0, + "grad_norm": 2.0220709617180606, + "language_loss": 0.81689894, + "learning_rate": 3.959360282528346e-06, + "loss": 0.8463943, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 2.728323221206665 + }, + { + "auxiliary_loss_clip": 0.01573336, + "auxiliary_loss_mlp": 0.01361815, + "balance_loss_clip": 1.20813203, + "balance_loss_mlp": 1.08086216, + "epoch": 0.09240943935066886, + "flos": 21142618954560.0, + "grad_norm": 2.3736008236526467, + "language_loss": 0.89216727, + "learning_rate": 3.959282132510131e-06, + "loss": 0.9215188, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 4.253436803817749 + }, + { + "auxiliary_loss_clip": 0.01572924, + "auxiliary_loss_mlp": 0.01347117, + "balance_loss_clip": 1.2073735, + "balance_loss_mlp": 1.06196833, + "epoch": 0.09246956260333684, + "flos": 20594399865120.0, + "grad_norm": 3.91932200467752, + "language_loss": 0.80855334, + "learning_rate": 3.959203908195741e-06, + "loss": 0.83775377, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 2.7821860313415527 + }, + { + "auxiliary_loss_clip": 0.01806144, + "auxiliary_loss_mlp": 0.01433361, + "balance_loss_clip": 1.44517469, + "balance_loss_mlp": 1.23041916, + "epoch": 0.09252968585600481, + "flos": 67565860287840.0, + "grad_norm": 0.7985876482497587, + "language_loss": 0.57361382, + "learning_rate": 3.959125609588142e-06, + "loss": 0.60600889, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 3.5234103202819824 + }, + { + "auxiliary_loss_clip": 0.01584907, + "auxiliary_loss_mlp": 0.01370546, + "balance_loss_clip": 1.21924675, + "balance_loss_mlp": 1.07738686, + "epoch": 0.09258980910867277, + "flos": 17385759253440.0, + "grad_norm": 4.475859289731088, + "language_loss": 0.68225759, + "learning_rate": 3.959047236690304e-06, + "loss": 0.71181214, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.7694389820098877 + }, + { + "auxiliary_loss_clip": 0.01581208, + "auxiliary_loss_mlp": 0.0135921, + "balance_loss_clip": 1.21677399, + "balance_loss_mlp": 1.06299877, + "epoch": 0.09264993236134075, + "flos": 19868030386560.0, + "grad_norm": 1.9256825551197754, + "language_loss": 0.84105968, + "learning_rate": 3.958968789505198e-06, + "loss": 0.87046385, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 4.240798473358154 + }, + { + "auxiliary_loss_clip": 0.01815489, + "auxiliary_loss_mlp": 0.01313774, + "balance_loss_clip": 1.45583367, + "balance_loss_mlp": 1.09481049, + "epoch": 0.09271005561400872, + "flos": 62290114554240.0, + "grad_norm": 0.8995339073855138, + "language_loss": 0.6191777, + "learning_rate": 3.9588902680358e-06, + "loss": 0.65047032, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 4.739258289337158 + }, + { + "auxiliary_loss_clip": 0.01575921, + "auxiliary_loss_mlp": 0.0136769, + "balance_loss_clip": 1.21062088, + "balance_loss_mlp": 1.069381, + "epoch": 0.09277017886667668, + "flos": 23332043849760.0, + "grad_norm": 1.865989598073652, + "language_loss": 0.82891798, + "learning_rate": 3.958811672285086e-06, + "loss": 0.85835409, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 4.202216625213623 + }, + { + "auxiliary_loss_clip": 0.01579407, + "auxiliary_loss_mlp": 0.01367871, + "balance_loss_clip": 1.2135601, + "balance_loss_mlp": 1.06136012, + "epoch": 0.09283030211934466, + "flos": 54749162733120.0, + "grad_norm": 2.2534198955054907, + "language_loss": 0.72274429, + "learning_rate": 3.958733002256038e-06, + "loss": 0.75221705, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 3.1204118728637695 + }, + { + "auxiliary_loss_clip": 0.01573013, + "auxiliary_loss_mlp": 0.01381504, + "balance_loss_clip": 1.20692885, + "balance_loss_mlp": 1.07690024, + "epoch": 0.09289042537201263, + "flos": 30337331166720.0, + "grad_norm": 2.2670958660875407, + "language_loss": 0.77822268, + "learning_rate": 3.958654257951637e-06, + "loss": 0.80776781, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.845137596130371 + }, + { + "auxiliary_loss_clip": 0.015803, + "auxiliary_loss_mlp": 0.01377913, + "balance_loss_clip": 1.21473205, + "balance_loss_mlp": 1.07140231, + "epoch": 0.09295054862468059, + "flos": 17748659531520.0, + "grad_norm": 4.008406446560689, + "language_loss": 0.75188202, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.7814641, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 2.7579963207244873 + }, + { + "auxiliary_loss_clip": 0.01576681, + "auxiliary_loss_mlp": 0.01359956, + "balance_loss_clip": 1.21048379, + "balance_loss_mlp": 1.06069303, + "epoch": 0.09301067187734856, + "flos": 23660201568960.0, + "grad_norm": 2.1629070274492177, + "language_loss": 0.84690809, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.87627447, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.7981841564178467 + }, + { + "auxiliary_loss_clip": 0.01576158, + "auxiliary_loss_mlp": 0.01358218, + "balance_loss_clip": 1.21099663, + "balance_loss_mlp": 1.0564754, + "epoch": 0.09307079513001654, + "flos": 27530239992480.0, + "grad_norm": 2.598865243081644, + "language_loss": 0.67649472, + "learning_rate": 3.958417579416199e-06, + "loss": 0.7058385, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.818516254425049 + }, + { + "auxiliary_loss_clip": 0.01580779, + "auxiliary_loss_mlp": 0.01376327, + "balance_loss_clip": 1.21515775, + "balance_loss_mlp": 1.07439387, + "epoch": 0.0931309183826845, + "flos": 20629218280320.0, + "grad_norm": 2.563974571131904, + "language_loss": 0.83771974, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.86729085, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 2.7606821060180664 + }, + { + "auxiliary_loss_clip": 0.01583705, + "auxiliary_loss_mlp": 0.01362746, + "balance_loss_clip": 1.21891069, + "balance_loss_mlp": 1.0669167, + "epoch": 0.09319104163535247, + "flos": 29023372804320.0, + "grad_norm": 3.079345869483345, + "language_loss": 0.76391733, + "learning_rate": 3.958259422403966e-06, + "loss": 0.79338181, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.8679723739624023 + }, + { + "auxiliary_loss_clip": 0.01584957, + "auxiliary_loss_mlp": 0.01370941, + "balance_loss_clip": 1.21967864, + "balance_loss_mlp": 1.07663751, + "epoch": 0.09325116488802045, + "flos": 25303910633280.0, + "grad_norm": 4.613416987363461, + "language_loss": 0.83422029, + "learning_rate": 3.95818023251026e-06, + "loss": 0.86377925, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.809567451477051 + }, + { + "auxiliary_loss_clip": 0.01784393, + "auxiliary_loss_mlp": 0.01286209, + "balance_loss_clip": 1.42456865, + "balance_loss_mlp": 1.05198669, + "epoch": 0.09331128814068841, + "flos": 61542884223360.0, + "grad_norm": 0.7565613080165974, + "language_loss": 0.61778462, + "learning_rate": 3.958100968362163e-06, + "loss": 0.64849067, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 3.47550892829895 + }, + { + "auxiliary_loss_clip": 0.01779863, + "auxiliary_loss_mlp": 0.01274307, + "balance_loss_clip": 1.42081547, + "balance_loss_mlp": 1.04466248, + "epoch": 0.09337141139335638, + "flos": 53300102443200.0, + "grad_norm": 0.8390630454807747, + "language_loss": 0.59002763, + "learning_rate": 3.958021629962681e-06, + "loss": 0.62056935, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 3.4067046642303467 + }, + { + "auxiliary_loss_clip": 0.015742, + "auxiliary_loss_mlp": 0.01371707, + "balance_loss_clip": 1.20916152, + "balance_loss_mlp": 1.09189868, + "epoch": 0.09343153464602436, + "flos": 23479092783360.0, + "grad_norm": 2.7801609933605347, + "language_loss": 0.87913561, + "learning_rate": 3.957942217314823e-06, + "loss": 0.90859473, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 2.815237522125244 + }, + { + "auxiliary_loss_clip": 0.01588731, + "auxiliary_loss_mlp": 0.01390349, + "balance_loss_clip": 1.22528911, + "balance_loss_mlp": 1.11702609, + "epoch": 0.09349165789869232, + "flos": 19355464131840.0, + "grad_norm": 2.496549387970383, + "language_loss": 0.81876814, + "learning_rate": 3.957862730421599e-06, + "loss": 0.8485589, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.7738516330718994 + }, + { + "auxiliary_loss_clip": 0.01769654, + "auxiliary_loss_mlp": 0.0128302, + "balance_loss_clip": 1.41124201, + "balance_loss_mlp": 1.06634521, + "epoch": 0.09355178115136029, + "flos": 67508626330080.0, + "grad_norm": 0.8776926833336863, + "language_loss": 0.59576964, + "learning_rate": 3.957783169286024e-06, + "loss": 0.6262964, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.276386022567749 + }, + { + "auxiliary_loss_clip": 0.0158524, + "auxiliary_loss_mlp": 0.01393237, + "balance_loss_clip": 1.22071433, + "balance_loss_mlp": 1.11762512, + "epoch": 0.09361190440402825, + "flos": 37344666604320.0, + "grad_norm": 2.230792785442673, + "language_loss": 0.842767, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.8725518, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.9362528324127197 + }, + { + "auxiliary_loss_clip": 0.015747, + "auxiliary_loss_mlp": 0.01379383, + "balance_loss_clip": 1.20909321, + "balance_loss_mlp": 1.10491598, + "epoch": 0.09367202765669623, + "flos": 24902019842400.0, + "grad_norm": 2.4529550266832905, + "language_loss": 0.77969658, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80923742, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 2.8609540462493896 + }, + { + "auxiliary_loss_clip": 0.01571654, + "auxiliary_loss_mlp": 0.01398526, + "balance_loss_clip": 1.20647836, + "balance_loss_mlp": 1.12176967, + "epoch": 0.0937321509093642, + "flos": 15707383486560.0, + "grad_norm": 2.112604936545774, + "language_loss": 0.80426693, + "learning_rate": 3.957544040455379e-06, + "loss": 0.83396876, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 2.8411076068878174 + }, + { + "auxiliary_loss_clip": 0.01583548, + "auxiliary_loss_mlp": 0.01371139, + "balance_loss_clip": 1.21789527, + "balance_loss_mlp": 1.08751619, + "epoch": 0.09379227416203216, + "flos": 20485696665600.0, + "grad_norm": 2.031612655063238, + "language_loss": 0.76241493, + "learning_rate": 3.957464182380599e-06, + "loss": 0.79196185, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 2.7896556854248047 + }, + { + "auxiliary_loss_clip": 0.01572114, + "auxiliary_loss_mlp": 0.01364422, + "balance_loss_clip": 1.20669079, + "balance_loss_mlp": 1.07412386, + "epoch": 0.09385239741470014, + "flos": 24354786885120.0, + "grad_norm": 2.1717875160681404, + "language_loss": 0.80941617, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83878154, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.8166344165802 + }, + { + "auxiliary_loss_clip": 0.01577446, + "auxiliary_loss_mlp": 0.01367177, + "balance_loss_clip": 1.21079707, + "balance_loss_mlp": 1.07916689, + "epoch": 0.0939125206673681, + "flos": 33294277958400.0, + "grad_norm": 6.81651789412547, + "language_loss": 0.61937439, + "learning_rate": 3.957304243552354e-06, + "loss": 0.64882064, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.8537449836730957 + }, + { + "auxiliary_loss_clip": 0.01585513, + "auxiliary_loss_mlp": 0.01367926, + "balance_loss_clip": 1.21909332, + "balance_loss_mlp": 1.08144271, + "epoch": 0.09397264392003607, + "flos": 19246950573120.0, + "grad_norm": 1.9725240718519579, + "language_loss": 0.84988368, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87941802, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 2.8803722858428955 + }, + { + "auxiliary_loss_clip": 0.01572517, + "auxiliary_loss_mlp": 0.01346123, + "balance_loss_clip": 1.20678639, + "balance_loss_mlp": 1.05468059, + "epoch": 0.09403276717270405, + "flos": 19319963009760.0, + "grad_norm": 2.0869451606349867, + "language_loss": 0.76384252, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.79302883, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 2.809110641479492 + }, + { + "auxiliary_loss_clip": 0.0157837, + "auxiliary_loss_mlp": 0.01348483, + "balance_loss_clip": 1.212255, + "balance_loss_mlp": 1.05112743, + "epoch": 0.09409289042537201, + "flos": 23585558221440.0, + "grad_norm": 2.47525161898125, + "language_loss": 0.79993349, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82920206, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.819528579711914 + }, + { + "auxiliary_loss_clip": 0.01573364, + "auxiliary_loss_mlp": 0.01348279, + "balance_loss_clip": 1.20689166, + "balance_loss_mlp": 1.05416548, + "epoch": 0.09415301367803998, + "flos": 20080202699520.0, + "grad_norm": 1.8833642537749613, + "language_loss": 0.75364059, + "learning_rate": 3.956983475266103e-06, + "loss": 0.782857, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 2.787658929824829 + }, + { + "auxiliary_loss_clip": 0.01570046, + "auxiliary_loss_mlp": 0.01350472, + "balance_loss_clip": 1.20472169, + "balance_loss_mlp": 1.0561682, + "epoch": 0.09421313693070796, + "flos": 21063803509440.0, + "grad_norm": 2.0423542725116643, + "language_loss": 0.78068978, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80989504, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 2.788367748260498 + }, + { + "auxiliary_loss_clip": 0.01574438, + "auxiliary_loss_mlp": 0.01356827, + "balance_loss_clip": 1.20863986, + "balance_loss_mlp": 1.05661058, + "epoch": 0.09427326018337592, + "flos": 24318451343520.0, + "grad_norm": 3.798916730026315, + "language_loss": 0.83038759, + "learning_rate": 3.956822645856749e-06, + "loss": 0.8597002, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.861647605895996 + }, + { + "auxiliary_loss_clip": 0.01573221, + "auxiliary_loss_mlp": 0.01364362, + "balance_loss_clip": 1.20556164, + "balance_loss_mlp": 1.06948566, + "epoch": 0.09433338343604389, + "flos": 20265824936160.0, + "grad_norm": 4.413173207166333, + "language_loss": 0.77136564, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.80074143, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.766957998275757 + }, + { + "auxiliary_loss_clip": 0.01578728, + "auxiliary_loss_mlp": 0.01347901, + "balance_loss_clip": 1.21054614, + "balance_loss_mlp": 1.04901934, + "epoch": 0.09439350668871185, + "flos": 12744216476640.0, + "grad_norm": 2.7900629643330532, + "language_loss": 0.85938734, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88865364, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 2.7953972816467285 + }, + { + "auxiliary_loss_clip": 0.01578756, + "auxiliary_loss_mlp": 0.01337819, + "balance_loss_clip": 1.21180725, + "balance_loss_mlp": 1.04198885, + "epoch": 0.09445362994137983, + "flos": 25964891235360.0, + "grad_norm": 2.0829491238108258, + "language_loss": 0.7654404, + "learning_rate": 3.95658084522853e-06, + "loss": 0.79460615, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.8245606422424316 + }, + { + "auxiliary_loss_clip": 0.01573497, + "auxiliary_loss_mlp": 0.0135714, + "balance_loss_clip": 1.2052021, + "balance_loss_mlp": 1.0672226, + "epoch": 0.0945137531940478, + "flos": 19717188636960.0, + "grad_norm": 3.238566304188086, + "language_loss": 0.79385316, + "learning_rate": 3.956500096627561e-06, + "loss": 0.82315952, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 2.8116252422332764 + }, + { + "auxiliary_loss_clip": 0.01576447, + "auxiliary_loss_mlp": 0.01357827, + "balance_loss_clip": 1.2089473, + "balance_loss_mlp": 1.06352305, + "epoch": 0.09457387644671576, + "flos": 23618973294720.0, + "grad_norm": 4.188324324394111, + "language_loss": 0.88096702, + "learning_rate": 3.956419273835913e-06, + "loss": 0.91030979, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 2.7766153812408447 + }, + { + "auxiliary_loss_clip": 0.01570579, + "auxiliary_loss_mlp": 0.01336503, + "balance_loss_clip": 1.20099497, + "balance_loss_mlp": 1.04086375, + "epoch": 0.09463399969938374, + "flos": 26909767029600.0, + "grad_norm": 7.395795083906112, + "language_loss": 0.81896245, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84803325, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 2.8207011222839355 + }, + { + "auxiliary_loss_clip": 0.01569078, + "auxiliary_loss_mlp": 0.01351526, + "balance_loss_clip": 1.20106542, + "balance_loss_mlp": 1.05874825, + "epoch": 0.0946941229520517, + "flos": 23661832479840.0, + "grad_norm": 2.7193260613959476, + "language_loss": 0.81279689, + "learning_rate": 3.95625740569284e-06, + "loss": 0.84200293, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 4.269311189651489 + }, + { + "auxiliary_loss_clip": 0.01569642, + "auxiliary_loss_mlp": 0.01354698, + "balance_loss_clip": 1.20116043, + "balance_loss_mlp": 1.06134748, + "epoch": 0.09475424620471967, + "flos": 24136356425760.0, + "grad_norm": 2.39874274578836, + "language_loss": 0.86844528, + "learning_rate": 3.956176360347553e-06, + "loss": 0.89768863, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 2.7819507122039795 + }, + { + "auxiliary_loss_clip": 0.01742887, + "auxiliary_loss_mlp": 0.0124456, + "balance_loss_clip": 1.37842822, + "balance_loss_mlp": 1.01873016, + "epoch": 0.09481436945738765, + "flos": 68432906769120.0, + "grad_norm": 1.005833066967614, + "language_loss": 0.65976048, + "learning_rate": 3.956095240823862e-06, + "loss": 0.68963492, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.261800527572632 + }, + { + "auxiliary_loss_clip": 0.01565783, + "auxiliary_loss_mlp": 0.01358828, + "balance_loss_clip": 1.19693017, + "balance_loss_mlp": 1.06929231, + "epoch": 0.09487449271005562, + "flos": 16655938312320.0, + "grad_norm": 3.2531042876982337, + "language_loss": 0.79071105, + "learning_rate": 3.956014047124844e-06, + "loss": 0.8199572, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 4.297030210494995 + }, + { + "auxiliary_loss_clip": 0.01559405, + "auxiliary_loss_mlp": 0.01367192, + "balance_loss_clip": 1.1901176, + "balance_loss_mlp": 1.07536733, + "epoch": 0.09493461596272358, + "flos": 24277754063520.0, + "grad_norm": 2.0181354512476672, + "language_loss": 0.78042024, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80968618, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 4.276596546173096 + }, + { + "auxiliary_loss_clip": 0.01563303, + "auxiliary_loss_mlp": 0.01376726, + "balance_loss_clip": 1.1945138, + "balance_loss_mlp": 1.09157693, + "epoch": 0.09499473921539155, + "flos": 21872250254880.0, + "grad_norm": 2.2401148655215115, + "language_loss": 0.73214054, + "learning_rate": 3.955851437213144e-06, + "loss": 0.76154083, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.821462392807007 + }, + { + "auxiliary_loss_clip": 0.01563199, + "auxiliary_loss_mlp": 0.01379519, + "balance_loss_clip": 1.19347692, + "balance_loss_mlp": 1.1042887, + "epoch": 0.09505486246805953, + "flos": 33549954235200.0, + "grad_norm": 1.9115083187418922, + "language_loss": 0.77820957, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80763674, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.9173877239227295 + }, + { + "auxiliary_loss_clip": 0.01560723, + "auxiliary_loss_mlp": 0.01384343, + "balance_loss_clip": 1.19135761, + "balance_loss_mlp": 1.10873103, + "epoch": 0.09511498572072749, + "flos": 21217489871040.0, + "grad_norm": 2.624588857830966, + "language_loss": 0.87058556, + "learning_rate": 3.955688530637116e-06, + "loss": 0.90003628, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 4.367383003234863 + }, + { + "auxiliary_loss_clip": 0.01560672, + "auxiliary_loss_mlp": 0.01385668, + "balance_loss_clip": 1.19045901, + "balance_loss_mlp": 1.1115824, + "epoch": 0.09517510897339546, + "flos": 14613031356480.0, + "grad_norm": 2.366398717700679, + "language_loss": 0.66847968, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69794309, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 2.813817024230957 + }, + { + "auxiliary_loss_clip": 0.01568421, + "auxiliary_loss_mlp": 0.01366762, + "balance_loss_clip": 1.20093155, + "balance_loss_mlp": 1.08275795, + "epoch": 0.09523523222606343, + "flos": 27819748552320.0, + "grad_norm": 1.9000180011365357, + "language_loss": 0.70565307, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7350049, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 2.8144073486328125 + }, + { + "auxiliary_loss_clip": 0.01563162, + "auxiliary_loss_mlp": 0.0137516, + "balance_loss_clip": 1.19763255, + "balance_loss_mlp": 1.09706891, + "epoch": 0.0952953554787314, + "flos": 20708488863360.0, + "grad_norm": 1.7626461087309353, + "language_loss": 0.8099581, + "learning_rate": 3.955443614581525e-06, + "loss": 0.83934128, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.864929437637329 + }, + { + "auxiliary_loss_clip": 0.01571668, + "auxiliary_loss_mlp": 0.01374002, + "balance_loss_clip": 1.20442927, + "balance_loss_mlp": 1.09724581, + "epoch": 0.09535547873139937, + "flos": 24789789324000.0, + "grad_norm": 2.738161858605276, + "language_loss": 0.72094899, + "learning_rate": 3.955361827590961e-06, + "loss": 0.75040567, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.83528208732605 + }, + { + "auxiliary_loss_clip": 0.01714034, + "auxiliary_loss_mlp": 0.01278839, + "balance_loss_clip": 1.34667873, + "balance_loss_mlp": 1.07131958, + "epoch": 0.09541560198406734, + "flos": 71918198294400.0, + "grad_norm": 0.8290545591777665, + "language_loss": 0.55347878, + "learning_rate": 3.955279966452883e-06, + "loss": 0.58340752, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 3.1664133071899414 + }, + { + "auxiliary_loss_clip": 0.01565659, + "auxiliary_loss_mlp": 0.01352803, + "balance_loss_clip": 1.19959986, + "balance_loss_mlp": 1.06155133, + "epoch": 0.09547572523673531, + "flos": 28984951213920.0, + "grad_norm": 1.86426707915565, + "language_loss": 0.81046033, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83964491, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.858194589614868 + }, + { + "auxiliary_loss_clip": 0.01569552, + "auxiliary_loss_mlp": 0.01345764, + "balance_loss_clip": 1.20344043, + "balance_loss_mlp": 1.05680108, + "epoch": 0.09553584848940327, + "flos": 24136204713120.0, + "grad_norm": 1.844474742196343, + "language_loss": 0.81660277, + "learning_rate": 3.955116021746594e-06, + "loss": 0.84575593, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.863316297531128 + }, + { + "auxiliary_loss_clip": 0.01575249, + "auxiliary_loss_mlp": 0.01356181, + "balance_loss_clip": 1.20832145, + "balance_loss_mlp": 1.06225824, + "epoch": 0.09559597174207124, + "flos": 42854735060640.0, + "grad_norm": 1.8708159330967102, + "language_loss": 0.65058672, + "learning_rate": 3.955033938184601e-06, + "loss": 0.679901, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 2.974672794342041 + }, + { + "auxiliary_loss_clip": 0.01570809, + "auxiliary_loss_mlp": 0.01357956, + "balance_loss_clip": 1.20476055, + "balance_loss_mlp": 1.05697656, + "epoch": 0.09565609499473922, + "flos": 32673577426560.0, + "grad_norm": 1.7230595035588132, + "language_loss": 0.83310771, + "learning_rate": 3.954951780487526e-06, + "loss": 0.86239541, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.841581106185913 + }, + { + "auxiliary_loss_clip": 0.01566957, + "auxiliary_loss_mlp": 0.0136299, + "balance_loss_clip": 1.2009654, + "balance_loss_mlp": 1.06716025, + "epoch": 0.09571621824740718, + "flos": 18480376880640.0, + "grad_norm": 2.8896345473565304, + "language_loss": 0.74359709, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.77289653, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.785074234008789 + }, + { + "auxiliary_loss_clip": 0.01574172, + "auxiliary_loss_mlp": 0.01358699, + "balance_loss_clip": 1.20838141, + "balance_loss_mlp": 1.06534922, + "epoch": 0.09577634150007515, + "flos": 29390065898400.0, + "grad_norm": 2.1560092613513198, + "language_loss": 0.74004543, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76937413, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.8747808933258057 + }, + { + "auxiliary_loss_clip": 0.01580218, + "auxiliary_loss_mlp": 0.01362035, + "balance_loss_clip": 1.21342099, + "balance_loss_mlp": 1.06372619, + "epoch": 0.09583646475274313, + "flos": 22750068333600.0, + "grad_norm": 2.615399173893056, + "language_loss": 0.70262921, + "learning_rate": 3.954704862616971e-06, + "loss": 0.73205173, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.7788381576538086 + }, + { + "auxiliary_loss_clip": 0.01567077, + "auxiliary_loss_mlp": 0.01366033, + "balance_loss_clip": 1.20092666, + "balance_loss_mlp": 1.06524432, + "epoch": 0.0958965880054111, + "flos": 23220420181920.0, + "grad_norm": 2.298534503940751, + "language_loss": 0.82107818, + "learning_rate": 3.954622408410747e-06, + "loss": 0.85040933, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 2.770742416381836 + }, + { + "auxiliary_loss_clip": 0.01575585, + "auxiliary_loss_mlp": 0.01355265, + "balance_loss_clip": 1.2077353, + "balance_loss_mlp": 1.05829096, + "epoch": 0.09595671125807906, + "flos": 21326572352160.0, + "grad_norm": 2.108301301271745, + "language_loss": 0.85097897, + "learning_rate": 3.954539880085045e-06, + "loss": 0.88028741, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.8153703212738037 + }, + { + "auxiliary_loss_clip": 0.01570557, + "auxiliary_loss_mlp": 0.01369269, + "balance_loss_clip": 1.20553327, + "balance_loss_mlp": 1.07858968, + "epoch": 0.09601683451074704, + "flos": 39606610870080.0, + "grad_norm": 1.7505420599212065, + "language_loss": 0.6869328, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71633101, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 2.9012961387634277 + }, + { + "auxiliary_loss_clip": 0.01567137, + "auxiliary_loss_mlp": 0.01331214, + "balance_loss_clip": 1.20086968, + "balance_loss_mlp": 1.04263234, + "epoch": 0.096076957763415, + "flos": 23734996629120.0, + "grad_norm": 2.4851013317963684, + "language_loss": 0.74852979, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77751327, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.8256635665893555 + }, + { + "auxiliary_loss_clip": 0.01577624, + "auxiliary_loss_mlp": 0.01369487, + "balance_loss_clip": 1.21093607, + "balance_loss_mlp": 1.08815265, + "epoch": 0.09613708101608297, + "flos": 34680983260320.0, + "grad_norm": 2.0197732563004225, + "language_loss": 0.69311345, + "learning_rate": 3.954291850422382e-06, + "loss": 0.72258461, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 2.8957839012145996 + }, + { + "auxiliary_loss_clip": 0.01575106, + "auxiliary_loss_mlp": 0.01345179, + "balance_loss_clip": 1.2076304, + "balance_loss_mlp": 1.06479883, + "epoch": 0.09619720426875093, + "flos": 20742093577440.0, + "grad_norm": 4.012848697201479, + "language_loss": 0.84023547, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86943829, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 2.7934653759002686 + }, + { + "auxiliary_loss_clip": 0.01570155, + "auxiliary_loss_mlp": 0.01343937, + "balance_loss_clip": 1.20357275, + "balance_loss_mlp": 1.06222129, + "epoch": 0.09625732752141891, + "flos": 13044420777600.0, + "grad_norm": 4.245499731104312, + "language_loss": 0.80626279, + "learning_rate": 3.954126126774001e-06, + "loss": 0.83540368, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.819133996963501 + }, + { + "auxiliary_loss_clip": 0.01585397, + "auxiliary_loss_mlp": 0.01354214, + "balance_loss_clip": 1.21953225, + "balance_loss_mlp": 1.07287955, + "epoch": 0.09631745077408688, + "flos": 22275847812960.0, + "grad_norm": 4.471373847894328, + "language_loss": 0.82394338, + "learning_rate": 3.954043153797251e-06, + "loss": 0.85333949, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.7469682693481445 + }, + { + "auxiliary_loss_clip": 0.01594618, + "auxiliary_loss_mlp": 0.01344722, + "balance_loss_clip": 1.22880232, + "balance_loss_mlp": 1.06396008, + "epoch": 0.09637757402675484, + "flos": 24756601819680.0, + "grad_norm": 4.660337299200548, + "language_loss": 0.63053024, + "learning_rate": 3.953960106722989e-06, + "loss": 0.65992361, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 2.8387997150421143 + }, + { + "auxiliary_loss_clip": 0.01582981, + "auxiliary_loss_mlp": 0.01371603, + "balance_loss_clip": 1.2177701, + "balance_loss_mlp": 1.09541881, + "epoch": 0.09643769727942282, + "flos": 22527807130080.0, + "grad_norm": 3.239883432660171, + "language_loss": 0.7136488, + "learning_rate": 3.953876985554364e-06, + "loss": 0.74319464, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.819148540496826 + }, + { + "auxiliary_loss_clip": 0.01585611, + "auxiliary_loss_mlp": 0.01360218, + "balance_loss_clip": 1.21995485, + "balance_loss_mlp": 1.08632278, + "epoch": 0.09649782053209079, + "flos": 30923744277600.0, + "grad_norm": 2.606135977407738, + "language_loss": 0.80121469, + "learning_rate": 3.953793790294527e-06, + "loss": 0.83067298, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 2.888615846633911 + }, + { + "auxiliary_loss_clip": 0.01583293, + "auxiliary_loss_mlp": 0.01357385, + "balance_loss_clip": 1.21789122, + "balance_loss_mlp": 1.07967472, + "epoch": 0.09655794378475875, + "flos": 25339980677760.0, + "grad_norm": 2.691018437524322, + "language_loss": 0.74234885, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77175558, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.8227005004882812 + }, + { + "auxiliary_loss_clip": 0.01597234, + "auxiliary_loss_mlp": 0.01356385, + "balance_loss_clip": 1.23218489, + "balance_loss_mlp": 1.08058286, + "epoch": 0.09661806703742673, + "flos": 22348253399040.0, + "grad_norm": 2.8923749548247843, + "language_loss": 0.75996661, + "learning_rate": 3.953627177513843e-06, + "loss": 0.7895028, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 2.856543779373169 + }, + { + "auxiliary_loss_clip": 0.01588857, + "auxiliary_loss_mlp": 0.01363815, + "balance_loss_clip": 1.22397065, + "balance_loss_mlp": 1.08972931, + "epoch": 0.0966781902900947, + "flos": 17459530253280.0, + "grad_norm": 1.9291093036320692, + "language_loss": 0.86801612, + "learning_rate": 3.953543759999312e-06, + "loss": 0.89754283, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.9154856204986572 + }, + { + "auxiliary_loss_clip": 0.0158744, + "auxiliary_loss_mlp": 0.0135493, + "balance_loss_clip": 1.22184062, + "balance_loss_mlp": 1.07970011, + "epoch": 0.09673831354276266, + "flos": 36907009194240.0, + "grad_norm": 2.903244106579991, + "language_loss": 0.70893985, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73836362, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 2.9002034664154053 + }, + { + "auxiliary_loss_clip": 0.01591953, + "auxiliary_loss_mlp": 0.01358155, + "balance_loss_clip": 1.22607803, + "balance_loss_mlp": 1.07262516, + "epoch": 0.09679843679543064, + "flos": 20703330633600.0, + "grad_norm": 2.8482080277565833, + "language_loss": 0.84831387, + "learning_rate": 3.953376702737693e-06, + "loss": 0.87781495, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 2.801016092300415 + }, + { + "auxiliary_loss_clip": 0.01606275, + "auxiliary_loss_mlp": 0.01349841, + "balance_loss_clip": 1.24020553, + "balance_loss_mlp": 1.0652647, + "epoch": 0.0968585600480986, + "flos": 23516717882400.0, + "grad_norm": 2.301486952222236, + "language_loss": 0.6775946, + "learning_rate": 3.953293062996939e-06, + "loss": 0.7071557, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 2.8093254566192627 + }, + { + "auxiliary_loss_clip": 0.01596673, + "auxiliary_loss_mlp": 0.01356164, + "balance_loss_clip": 1.23159266, + "balance_loss_mlp": 1.0763557, + "epoch": 0.09691868330076657, + "flos": 20123175669120.0, + "grad_norm": 2.190427007702879, + "language_loss": 0.81263304, + "learning_rate": 3.953209349187115e-06, + "loss": 0.84216142, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 4.430294036865234 + }, + { + "auxiliary_loss_clip": 0.01595653, + "auxiliary_loss_mlp": 0.01350063, + "balance_loss_clip": 1.23079467, + "balance_loss_mlp": 1.06338906, + "epoch": 0.09697880655343454, + "flos": 16546552405920.0, + "grad_norm": 5.216186934214176, + "language_loss": 0.80734718, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83680433, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 2.8215911388397217 + }, + { + "auxiliary_loss_clip": 0.01605011, + "auxiliary_loss_mlp": 0.01352466, + "balance_loss_clip": 1.23975015, + "balance_loss_mlp": 1.0667448, + "epoch": 0.09703892980610251, + "flos": 26106592298400.0, + "grad_norm": 2.1884547644684713, + "language_loss": 0.8455795, + "learning_rate": 3.953041699372964e-06, + "loss": 0.87515432, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 2.8410065174102783 + }, + { + "auxiliary_loss_clip": 0.0183221, + "auxiliary_loss_mlp": 0.01301727, + "balance_loss_clip": 1.46109653, + "balance_loss_mlp": 1.07131958, + "epoch": 0.09709905305877048, + "flos": 60450352644960.0, + "grad_norm": 0.8042715487236561, + "language_loss": 0.54567468, + "learning_rate": 3.952957763374992e-06, + "loss": 0.57701409, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.2501180171966553 + }, + { + "auxiliary_loss_clip": 0.01833393, + "auxiliary_loss_mlp": 0.01264664, + "balance_loss_clip": 1.46167243, + "balance_loss_mlp": 1.0403595, + "epoch": 0.09715917631143844, + "flos": 57646826717760.0, + "grad_norm": 0.7908746057092948, + "language_loss": 0.58155584, + "learning_rate": 3.952873753320666e-06, + "loss": 0.61253643, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 3.495590925216675 + }, + { + "auxiliary_loss_clip": 0.01596745, + "auxiliary_loss_mlp": 0.01360512, + "balance_loss_clip": 1.23244166, + "balance_loss_mlp": 1.08623528, + "epoch": 0.09721929956410642, + "flos": 20560302084960.0, + "grad_norm": 2.469525746714079, + "language_loss": 0.69600147, + "learning_rate": 3.952789669213172e-06, + "loss": 0.72557408, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 4.3246071338653564 + }, + { + "auxiliary_loss_clip": 0.01598743, + "auxiliary_loss_mlp": 0.01363901, + "balance_loss_clip": 1.23465776, + "balance_loss_mlp": 1.09057784, + "epoch": 0.09727942281677439, + "flos": 27346817589120.0, + "grad_norm": 2.638192841609343, + "language_loss": 0.80685627, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8364827, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 4.290903806686401 + }, + { + "auxiliary_loss_clip": 0.01604974, + "auxiliary_loss_mlp": 0.01368648, + "balance_loss_clip": 1.2408011, + "balance_loss_mlp": 1.10066545, + "epoch": 0.09733954606944235, + "flos": 24902361195840.0, + "grad_norm": 2.0483208908768047, + "language_loss": 0.92892003, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95865625, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 4.299603700637817 + }, + { + "auxiliary_loss_clip": 0.01607347, + "auxiliary_loss_mlp": 0.01386125, + "balance_loss_clip": 1.24275339, + "balance_loss_mlp": 1.11909616, + "epoch": 0.09739966932211033, + "flos": 31506630069600.0, + "grad_norm": 5.079803699669864, + "language_loss": 0.8870967, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.91703141, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 2.8429102897644043 + }, + { + "auxiliary_loss_clip": 0.01596111, + "auxiliary_loss_mlp": 0.01396982, + "balance_loss_clip": 1.2334466, + "balance_loss_mlp": 1.12671113, + "epoch": 0.0974597925747783, + "flos": 23881059430560.0, + "grad_norm": 2.4004141872655373, + "language_loss": 0.76896548, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79889637, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.8833935260772705 + }, + { + "auxiliary_loss_clip": 0.01597096, + "auxiliary_loss_mlp": 0.01383217, + "balance_loss_clip": 1.23392248, + "balance_loss_mlp": 1.11656988, + "epoch": 0.09751991582744626, + "flos": 17021493561600.0, + "grad_norm": 2.0958158917034795, + "language_loss": 0.78024048, + "learning_rate": 3.952368137989871e-06, + "loss": 0.81004363, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.7970409393310547 + }, + { + "auxiliary_loss_clip": 0.016018, + "auxiliary_loss_mlp": 0.01371379, + "balance_loss_clip": 1.23734069, + "balance_loss_mlp": 1.09710181, + "epoch": 0.09758003908011423, + "flos": 28405365171840.0, + "grad_norm": 1.9363874356856885, + "language_loss": 0.85685897, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88659078, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 2.9241864681243896 + }, + { + "auxiliary_loss_clip": 0.01604499, + "auxiliary_loss_mlp": 0.01370027, + "balance_loss_clip": 1.24056625, + "balance_loss_mlp": 1.10032797, + "epoch": 0.09764016233278221, + "flos": 18145923086880.0, + "grad_norm": 2.3099630674408185, + "language_loss": 0.80455124, + "learning_rate": 3.952199007240184e-06, + "loss": 0.83429652, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.8017117977142334 + }, + { + "auxiliary_loss_clip": 0.01601739, + "auxiliary_loss_mlp": 0.01342639, + "balance_loss_clip": 1.23769665, + "balance_loss_mlp": 1.0637846, + "epoch": 0.09770028558545017, + "flos": 15267526243200.0, + "grad_norm": 2.340130432232315, + "language_loss": 0.85899901, + "learning_rate": 3.952114330822364e-06, + "loss": 0.88844287, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.7527997493743896 + }, + { + "auxiliary_loss_clip": 0.01598814, + "auxiliary_loss_mlp": 0.0135237, + "balance_loss_clip": 1.23445094, + "balance_loss_mlp": 1.06912827, + "epoch": 0.09776040883811814, + "flos": 23474427619680.0, + "grad_norm": 2.821897534780064, + "language_loss": 0.85525942, + "learning_rate": 3.952029580380172e-06, + "loss": 0.88477135, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.759861469268799 + }, + { + "auxiliary_loss_clip": 0.0159232, + "auxiliary_loss_mlp": 0.01351872, + "balance_loss_clip": 1.22763515, + "balance_loss_mlp": 1.07339919, + "epoch": 0.09782053209078612, + "flos": 24501835818720.0, + "grad_norm": 2.1312425581335788, + "language_loss": 0.83329356, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.86273551, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.851658821105957 + }, + { + "auxiliary_loss_clip": 0.01592109, + "auxiliary_loss_mlp": 0.013445, + "balance_loss_clip": 1.228477, + "balance_loss_mlp": 1.06354809, + "epoch": 0.09788065534345408, + "flos": 21582324485280.0, + "grad_norm": 3.1207135877977126, + "language_loss": 0.84544861, + "learning_rate": 3.951859857435534e-06, + "loss": 0.87481469, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.79774808883667 + }, + { + "auxiliary_loss_clip": 0.01592029, + "auxiliary_loss_mlp": 0.01345029, + "balance_loss_clip": 1.22770727, + "balance_loss_mlp": 1.06178761, + "epoch": 0.09794077859612205, + "flos": 23844761817120.0, + "grad_norm": 1.681650599414661, + "language_loss": 0.75847375, + "learning_rate": 3.951774884939523e-06, + "loss": 0.7878443, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.907288074493408 + }, + { + "auxiliary_loss_clip": 0.01602545, + "auxiliary_loss_mlp": 0.01339522, + "balance_loss_clip": 1.2378633, + "balance_loss_mlp": 1.05475473, + "epoch": 0.09800090184879003, + "flos": 23662173833280.0, + "grad_norm": 3.347047091144335, + "language_loss": 0.7829774, + "learning_rate": 3.951689838432013e-06, + "loss": 0.81239808, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.838350534439087 + }, + { + "auxiliary_loss_clip": 0.01595955, + "auxiliary_loss_mlp": 0.01343583, + "balance_loss_clip": 1.23147082, + "balance_loss_mlp": 1.06377482, + "epoch": 0.09806102510145799, + "flos": 17057677390560.0, + "grad_norm": 2.5712101565115324, + "language_loss": 0.86714464, + "learning_rate": 3.951604717916228e-06, + "loss": 0.89654005, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.799325704574585 + }, + { + "auxiliary_loss_clip": 0.01590553, + "auxiliary_loss_mlp": 0.01338612, + "balance_loss_clip": 1.22672963, + "balance_loss_mlp": 1.05689692, + "epoch": 0.09812114835412596, + "flos": 23880756005280.0, + "grad_norm": 2.042660096674311, + "language_loss": 0.82812309, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85741472, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.9006237983703613 + }, + { + "auxiliary_loss_clip": 0.01593537, + "auxiliary_loss_mlp": 0.01348427, + "balance_loss_clip": 1.2305125, + "balance_loss_mlp": 1.06289709, + "epoch": 0.09818127160679392, + "flos": 20597623758720.0, + "grad_norm": 1.6600380276809656, + "language_loss": 0.78974116, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81916082, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.815613269805908 + }, + { + "auxiliary_loss_clip": 0.01591217, + "auxiliary_loss_mlp": 0.01366586, + "balance_loss_clip": 1.22778237, + "balance_loss_mlp": 1.07705045, + "epoch": 0.0982413948594619, + "flos": 15489180596160.0, + "grad_norm": 2.67588968310594, + "language_loss": 0.73252934, + "learning_rate": 3.951348912351521e-06, + "loss": 0.76210737, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.8155970573425293 + }, + { + "auxiliary_loss_clip": 0.01588794, + "auxiliary_loss_mlp": 0.01338137, + "balance_loss_clip": 1.22545505, + "balance_loss_mlp": 1.04097223, + "epoch": 0.09830151811212987, + "flos": 24210317066400.0, + "grad_norm": 3.50054391888088, + "language_loss": 0.73011297, + "learning_rate": 3.951263495834947e-06, + "loss": 0.75938225, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 2.881359338760376 + }, + { + "auxiliary_loss_clip": 0.01593072, + "auxiliary_loss_mlp": 0.01358902, + "balance_loss_clip": 1.23002267, + "balance_loss_mlp": 1.06803131, + "epoch": 0.09836164136479783, + "flos": 20596675554720.0, + "grad_norm": 2.6619306102216673, + "language_loss": 0.78473812, + "learning_rate": 3.951178005326264e-06, + "loss": 0.81425786, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.9183309078216553 + }, + { + "auxiliary_loss_clip": 0.01586073, + "auxiliary_loss_mlp": 0.01351275, + "balance_loss_clip": 1.22366643, + "balance_loss_mlp": 1.05563617, + "epoch": 0.09842176461746581, + "flos": 19935998377920.0, + "grad_norm": 2.058084605450474, + "language_loss": 0.69957709, + "learning_rate": 3.951092440828715e-06, + "loss": 0.72895062, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.8363983631134033 + }, + { + "auxiliary_loss_clip": 0.01592827, + "auxiliary_loss_mlp": 0.01338995, + "balance_loss_clip": 1.22982287, + "balance_loss_mlp": 1.05460954, + "epoch": 0.09848188787013377, + "flos": 21216769236000.0, + "grad_norm": 2.0962319690984694, + "language_loss": 0.77752388, + "learning_rate": 3.951006802345545e-06, + "loss": 0.80684209, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.8128817081451416 + }, + { + "auxiliary_loss_clip": 0.01597149, + "auxiliary_loss_mlp": 0.01325927, + "balance_loss_clip": 1.23518014, + "balance_loss_mlp": 1.04497457, + "epoch": 0.09854201112280174, + "flos": 30156867159840.0, + "grad_norm": 1.5289467984765888, + "language_loss": 0.72796136, + "learning_rate": 3.950921089880003e-06, + "loss": 0.75719213, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 2.929672956466675 + }, + { + "auxiliary_loss_clip": 0.01590736, + "auxiliary_loss_mlp": 0.0133802, + "balance_loss_clip": 1.22796535, + "balance_loss_mlp": 1.05039179, + "epoch": 0.09860213437546972, + "flos": 21797720691840.0, + "grad_norm": 2.041458448058566, + "language_loss": 0.88764679, + "learning_rate": 3.950835303435337e-06, + "loss": 0.91693431, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 2.87429141998291 + }, + { + "auxiliary_loss_clip": 0.01598671, + "auxiliary_loss_mlp": 0.01352073, + "balance_loss_clip": 1.23692155, + "balance_loss_mlp": 1.06597066, + "epoch": 0.09866225762813768, + "flos": 21837735264960.0, + "grad_norm": 2.269317165187076, + "language_loss": 0.80896074, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83846819, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.846555233001709 + }, + { + "auxiliary_loss_clip": 0.01604248, + "auxiliary_loss_mlp": 0.01352591, + "balance_loss_clip": 1.24244881, + "balance_loss_mlp": 1.07449937, + "epoch": 0.09872238088080565, + "flos": 17601610597920.0, + "grad_norm": 2.58327612847622, + "language_loss": 0.86721134, + "learning_rate": 3.95066350862165e-06, + "loss": 0.89677978, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 2.799180746078491 + }, + { + "auxiliary_loss_clip": 0.01600714, + "auxiliary_loss_mlp": 0.01342559, + "balance_loss_clip": 1.23941278, + "balance_loss_mlp": 1.06008089, + "epoch": 0.09878250413347361, + "flos": 27638601838560.0, + "grad_norm": 1.9017610620392178, + "language_loss": 0.80815458, + "learning_rate": 3.950577500259144e-06, + "loss": 0.8375873, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.914703845977783 + }, + { + "auxiliary_loss_clip": 0.01607218, + "auxiliary_loss_mlp": 0.01345032, + "balance_loss_clip": 1.24502635, + "balance_loss_mlp": 1.06083727, + "epoch": 0.0988426273861416, + "flos": 16546590334080.0, + "grad_norm": 1.8856994907191202, + "language_loss": 0.82748562, + "learning_rate": 3.950491417930543e-06, + "loss": 0.8570081, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 2.7752532958984375 + }, + { + "auxiliary_loss_clip": 0.0159625, + "auxiliary_loss_mlp": 0.01355581, + "balance_loss_clip": 1.23356056, + "balance_loss_mlp": 1.07176709, + "epoch": 0.09890275063880956, + "flos": 21217414014720.0, + "grad_norm": 2.4166370728926263, + "language_loss": 0.69035971, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.71987808, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 2.822387456893921 + }, + { + "auxiliary_loss_clip": 0.01847024, + "auxiliary_loss_mlp": 0.01935573, + "balance_loss_clip": 1.48139203, + "balance_loss_mlp": 1.74789047, + "epoch": 0.09896287389147752, + "flos": 59385698628480.0, + "grad_norm": 1.193079177126299, + "language_loss": 0.60811025, + "learning_rate": 3.950319031388119e-06, + "loss": 0.64593619, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 3.2888214588165283 + }, + { + "auxiliary_loss_clip": 0.01607484, + "auxiliary_loss_mlp": 0.0134889, + "balance_loss_clip": 1.24453378, + "balance_loss_mlp": 1.05973577, + "epoch": 0.0990229971441455, + "flos": 29645249109120.0, + "grad_norm": 22.718095786092263, + "language_loss": 0.7311781, + "learning_rate": 3.950232727180833e-06, + "loss": 0.76074189, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 2.9225003719329834 + }, + { + "auxiliary_loss_clip": 0.01608599, + "auxiliary_loss_mlp": 0.01343021, + "balance_loss_clip": 1.24584246, + "balance_loss_mlp": 1.04909897, + "epoch": 0.09908312039681347, + "flos": 21837014629920.0, + "grad_norm": 1.9064461125878107, + "language_loss": 0.84394109, + "learning_rate": 3.950146349020525e-06, + "loss": 0.87345731, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 2.9113667011260986 + }, + { + "auxiliary_loss_clip": 0.01841771, + "auxiliary_loss_mlp": 0.01268135, + "balance_loss_clip": 1.47485399, + "balance_loss_mlp": 1.0560379, + "epoch": 0.09914324364948143, + "flos": 57571045525440.0, + "grad_norm": 0.7805917699987102, + "language_loss": 0.55612552, + "learning_rate": 3.950059896910473e-06, + "loss": 0.58722454, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 3.2167699337005615 + }, + { + "auxiliary_loss_clip": 0.01596653, + "auxiliary_loss_mlp": 0.01378903, + "balance_loss_clip": 1.23378706, + "balance_loss_mlp": 1.07506239, + "epoch": 0.09920336690214941, + "flos": 34126392240000.0, + "grad_norm": 2.9201414024386567, + "language_loss": 0.90142745, + "learning_rate": 3.949973370853954e-06, + "loss": 0.93118298, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 4.444278717041016 + }, + { + "auxiliary_loss_clip": 0.0183257, + "auxiliary_loss_mlp": 0.01324303, + "balance_loss_clip": 1.46534145, + "balance_loss_mlp": 1.0984726, + "epoch": 0.09926349015481738, + "flos": 71224295320800.0, + "grad_norm": 0.8112607804292643, + "language_loss": 0.63701147, + "learning_rate": 3.94988677085425e-06, + "loss": 0.66858017, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 3.437191963195801 + }, + { + "auxiliary_loss_clip": 0.01600238, + "auxiliary_loss_mlp": 0.01405245, + "balance_loss_clip": 1.23796546, + "balance_loss_mlp": 1.09434748, + "epoch": 0.09932361340748534, + "flos": 23150935064160.0, + "grad_norm": 1.9879633983451208, + "language_loss": 0.88138288, + "learning_rate": 3.949800096914643e-06, + "loss": 0.91143775, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 2.8517701625823975 + }, + { + "auxiliary_loss_clip": 0.01607953, + "auxiliary_loss_mlp": 0.01411355, + "balance_loss_clip": 1.24602616, + "balance_loss_mlp": 1.10408139, + "epoch": 0.09938373666015332, + "flos": 19830708712800.0, + "grad_norm": 2.385243662361348, + "language_loss": 0.82413244, + "learning_rate": 3.949713349038422e-06, + "loss": 0.85432547, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.8269989490509033 + }, + { + "auxiliary_loss_clip": 0.01604401, + "auxiliary_loss_mlp": 0.0138582, + "balance_loss_clip": 1.24215341, + "balance_loss_mlp": 1.07549477, + "epoch": 0.09944385991282129, + "flos": 22092842619360.0, + "grad_norm": 1.8780080494856033, + "language_loss": 0.79589975, + "learning_rate": 3.949626527228875e-06, + "loss": 0.82580197, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.8614299297332764 + }, + { + "auxiliary_loss_clip": 0.01611308, + "auxiliary_loss_mlp": 0.01381713, + "balance_loss_clip": 1.24940264, + "balance_loss_mlp": 1.07577419, + "epoch": 0.09950398316548925, + "flos": 19830936281760.0, + "grad_norm": 1.6785603247923353, + "language_loss": 0.81731629, + "learning_rate": 3.949539631489295e-06, + "loss": 0.84724653, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 5.7428295612335205 + }, + { + "auxiliary_loss_clip": 0.01600361, + "auxiliary_loss_mlp": 0.013808, + "balance_loss_clip": 1.23843443, + "balance_loss_mlp": 1.08592379, + "epoch": 0.09956410641815722, + "flos": 25005716524800.0, + "grad_norm": 1.9315471367048773, + "language_loss": 0.80968899, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83950061, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.81620454788208 + }, + { + "auxiliary_loss_clip": 0.01596019, + "auxiliary_loss_mlp": 0.01365501, + "balance_loss_clip": 1.23367548, + "balance_loss_mlp": 1.07100654, + "epoch": 0.0996242296708252, + "flos": 19319735440800.0, + "grad_norm": 1.768405148631309, + "language_loss": 0.8896085, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91922367, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 4.190796375274658 + }, + { + "auxiliary_loss_clip": 0.01602663, + "auxiliary_loss_mlp": 0.01368818, + "balance_loss_clip": 1.24122036, + "balance_loss_mlp": 1.08080888, + "epoch": 0.09968435292349316, + "flos": 21873767381280.0, + "grad_norm": 2.966068030665898, + "language_loss": 0.8517924, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.88150716, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.8429949283599854 + }, + { + "auxiliary_loss_clip": 0.01812414, + "auxiliary_loss_mlp": 0.01329338, + "balance_loss_clip": 1.44462085, + "balance_loss_mlp": 1.09740448, + "epoch": 0.09974447617616113, + "flos": 65390355391680.0, + "grad_norm": 0.9145060284556547, + "language_loss": 0.6075511, + "learning_rate": 3.949191309296585e-06, + "loss": 0.63896859, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 3.393333911895752 + }, + { + "auxiliary_loss_clip": 0.01596539, + "auxiliary_loss_mlp": 0.01347268, + "balance_loss_clip": 1.23483109, + "balance_loss_mlp": 1.07241893, + "epoch": 0.0998045994288291, + "flos": 23662097976960.0, + "grad_norm": 2.0422253586125985, + "language_loss": 0.85788524, + "learning_rate": 3.949104043956321e-06, + "loss": 0.88732326, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.7801551818847656 + }, + { + "auxiliary_loss_clip": 0.01597876, + "auxiliary_loss_mlp": 0.0137681, + "balance_loss_clip": 1.23685026, + "balance_loss_mlp": 1.10997176, + "epoch": 0.09986472268149707, + "flos": 19611747259200.0, + "grad_norm": 2.1392492906457434, + "language_loss": 0.80188423, + "learning_rate": 3.949016704705836e-06, + "loss": 0.83163106, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.8498289585113525 + }, + { + "auxiliary_loss_clip": 0.01584348, + "auxiliary_loss_mlp": 0.0139818, + "balance_loss_clip": 1.22377086, + "balance_loss_mlp": 1.13382125, + "epoch": 0.09992484593416504, + "flos": 26215712707680.0, + "grad_norm": 2.0540368662308266, + "language_loss": 0.83678782, + "learning_rate": 3.948929291548443e-06, + "loss": 0.86661303, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 2.8979263305664062 + }, + { + "auxiliary_loss_clip": 0.01588292, + "auxiliary_loss_mlp": 0.01415829, + "balance_loss_clip": 1.22823727, + "balance_loss_mlp": 1.16024458, + "epoch": 0.09998496918683301, + "flos": 17495562369600.0, + "grad_norm": 2.344203418921806, + "language_loss": 0.89366692, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.9237082, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 2.804574489593506 + }, + { + "auxiliary_loss_clip": 0.01600991, + "auxiliary_loss_mlp": 0.01433134, + "balance_loss_clip": 1.24092269, + "balance_loss_mlp": 1.17735887, + "epoch": 0.10004509243950098, + "flos": 22787200366560.0, + "grad_norm": 1.682565160726373, + "language_loss": 0.70202005, + "learning_rate": 3.948754243526191e-06, + "loss": 0.73236138, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.7866616249084473 + }, + { + "auxiliary_loss_clip": 0.01599976, + "auxiliary_loss_mlp": 0.01439755, + "balance_loss_clip": 1.23869264, + "balance_loss_mlp": 1.18512368, + "epoch": 0.10010521569216894, + "flos": 16255261222560.0, + "grad_norm": 2.6127038651692196, + "language_loss": 0.78450191, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81489921, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.821721315383911 + }, + { + "auxiliary_loss_clip": 0.0160041, + "auxiliary_loss_mlp": 0.01440848, + "balance_loss_clip": 1.23959589, + "balance_loss_mlp": 1.1827836, + "epoch": 0.10016533894483691, + "flos": 23404752861120.0, + "grad_norm": 2.25611779391388, + "language_loss": 0.70249265, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.73290527, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.8436129093170166 + }, + { + "auxiliary_loss_clip": 0.01590251, + "auxiliary_loss_mlp": 0.01423544, + "balance_loss_clip": 1.23100913, + "balance_loss_mlp": 1.16395426, + "epoch": 0.10022546219750489, + "flos": 19356184766880.0, + "grad_norm": 2.4139120165752277, + "language_loss": 0.79183555, + "learning_rate": 3.948491117273956e-06, + "loss": 0.82197356, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.762037515640259 + }, + { + "auxiliary_loss_clip": 0.01597864, + "auxiliary_loss_mlp": 0.0143033, + "balance_loss_clip": 1.23676753, + "balance_loss_mlp": 1.17455471, + "epoch": 0.10028558545017285, + "flos": 27088031203200.0, + "grad_norm": 2.543786339341695, + "language_loss": 0.77664852, + "learning_rate": 3.948403260744817e-06, + "loss": 0.80693042, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 2.8890511989593506 + }, + { + "auxiliary_loss_clip": 0.01590719, + "auxiliary_loss_mlp": 0.01381155, + "balance_loss_clip": 1.2310245, + "balance_loss_mlp": 1.1183219, + "epoch": 0.10034570870284082, + "flos": 25849436823360.0, + "grad_norm": 2.3846484086795146, + "language_loss": 0.78201282, + "learning_rate": 3.948315330332031e-06, + "loss": 0.81173158, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.8029592037200928 + }, + { + "auxiliary_loss_clip": 0.0159456, + "auxiliary_loss_mlp": 0.01408392, + "balance_loss_clip": 1.2356708, + "balance_loss_mlp": 1.14040923, + "epoch": 0.1004058319555088, + "flos": 26251972392960.0, + "grad_norm": 2.6024086187793602, + "language_loss": 0.85101759, + "learning_rate": 3.948227326038933e-06, + "loss": 0.88104707, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.8371498584747314 + }, + { + "auxiliary_loss_clip": 0.01586785, + "auxiliary_loss_mlp": 0.01339356, + "balance_loss_clip": 1.22610903, + "balance_loss_mlp": 1.06679583, + "epoch": 0.10046595520817676, + "flos": 25376847213600.0, + "grad_norm": 1.6886346115811162, + "language_loss": 0.77126861, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.80053002, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.8311820030212402 + }, + { + "auxiliary_loss_clip": 0.01795878, + "auxiliary_loss_mlp": 0.01284317, + "balance_loss_clip": 1.42966771, + "balance_loss_mlp": 1.06535339, + "epoch": 0.10052607846084473, + "flos": 67467625624800.0, + "grad_norm": 0.7835882750700401, + "language_loss": 0.60712588, + "learning_rate": 3.948051095825149e-06, + "loss": 0.63792789, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.3721776008605957 + }, + { + "auxiliary_loss_clip": 0.01579713, + "auxiliary_loss_mlp": 0.01354097, + "balance_loss_clip": 1.22091579, + "balance_loss_mlp": 1.07047462, + "epoch": 0.10058620171351271, + "flos": 21362604468480.0, + "grad_norm": 3.1790107017470848, + "language_loss": 0.77647722, + "learning_rate": 3.947962869911147e-06, + "loss": 0.80581534, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.898684024810791 + }, + { + "auxiliary_loss_clip": 0.01579699, + "auxiliary_loss_mlp": 0.0134416, + "balance_loss_clip": 1.21965909, + "balance_loss_mlp": 1.05958331, + "epoch": 0.10064632496618067, + "flos": 16802001113760.0, + "grad_norm": 2.5499227684972863, + "language_loss": 0.74258035, + "learning_rate": 3.947874570130197e-06, + "loss": 0.771819, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 2.7823450565338135 + }, + { + "auxiliary_loss_clip": 0.01588935, + "auxiliary_loss_mlp": 0.01377545, + "balance_loss_clip": 1.22883856, + "balance_loss_mlp": 1.08629322, + "epoch": 0.10070644821884864, + "flos": 23626786495680.0, + "grad_norm": 2.3452154880804534, + "language_loss": 0.8024717, + "learning_rate": 3.947786196485649e-06, + "loss": 0.83213651, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.8485448360443115 + }, + { + "auxiliary_loss_clip": 0.01591446, + "auxiliary_loss_mlp": 0.01375983, + "balance_loss_clip": 1.23157859, + "balance_loss_mlp": 1.0832051, + "epoch": 0.1007665714715166, + "flos": 24464893426560.0, + "grad_norm": 2.359969578701015, + "language_loss": 0.81486541, + "learning_rate": 3.947697748980853e-06, + "loss": 0.84453964, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.8211722373962402 + }, + { + "auxiliary_loss_clip": 0.01586341, + "auxiliary_loss_mlp": 0.01382377, + "balance_loss_clip": 1.22714531, + "balance_loss_mlp": 1.09131563, + "epoch": 0.10082669472418458, + "flos": 16800939125280.0, + "grad_norm": 2.2367674848704078, + "language_loss": 0.86450535, + "learning_rate": 3.947609227619163e-06, + "loss": 0.89419258, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.760434150695801 + }, + { + "auxiliary_loss_clip": 0.01582911, + "auxiliary_loss_mlp": 0.01383078, + "balance_loss_clip": 1.22379267, + "balance_loss_mlp": 1.09106314, + "epoch": 0.10088681797685255, + "flos": 13555242336960.0, + "grad_norm": 3.5374267728985616, + "language_loss": 0.86557007, + "learning_rate": 3.947520632403936e-06, + "loss": 0.89522994, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.7923226356506348 + }, + { + "auxiliary_loss_clip": 0.01581214, + "auxiliary_loss_mlp": 0.01375217, + "balance_loss_clip": 1.22140002, + "balance_loss_mlp": 1.09006834, + "epoch": 0.10094694122952051, + "flos": 25268106085920.0, + "grad_norm": 2.751499724756277, + "language_loss": 0.90190434, + "learning_rate": 3.947431963338532e-06, + "loss": 0.93146861, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.936603307723999 + }, + { + "auxiliary_loss_clip": 0.01790183, + "auxiliary_loss_mlp": 0.01318764, + "balance_loss_clip": 1.42709422, + "balance_loss_mlp": 1.09903717, + "epoch": 0.10100706448218849, + "flos": 69861409267680.0, + "grad_norm": 0.7788859201182132, + "language_loss": 0.5288204, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55990994, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 3.365609645843506 + }, + { + "auxiliary_loss_clip": 0.01582873, + "auxiliary_loss_mlp": 0.0135495, + "balance_loss_clip": 1.22425246, + "balance_loss_mlp": 1.07018244, + "epoch": 0.10106718773485646, + "flos": 20009010814560.0, + "grad_norm": 1.7535560212056254, + "language_loss": 0.7723583, + "learning_rate": 3.947254403670641e-06, + "loss": 0.80173647, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 2.7445638179779053 + }, + { + "auxiliary_loss_clip": 0.01583697, + "auxiliary_loss_mlp": 0.0136034, + "balance_loss_clip": 1.22421432, + "balance_loss_mlp": 1.07538223, + "epoch": 0.10112731098752442, + "flos": 13481509265280.0, + "grad_norm": 3.025615324850629, + "language_loss": 0.93992865, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96936899, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 2.7408623695373535 + }, + { + "auxiliary_loss_clip": 0.01578716, + "auxiliary_loss_mlp": 0.0133771, + "balance_loss_clip": 1.21884847, + "balance_loss_mlp": 1.06286097, + "epoch": 0.1011874342401924, + "flos": 18517926123360.0, + "grad_norm": 2.3522382322024113, + "language_loss": 0.8802979, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90946215, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 2.7736032009124756 + }, + { + "auxiliary_loss_clip": 0.0158414, + "auxiliary_loss_mlp": 0.01362314, + "balance_loss_clip": 1.22538519, + "balance_loss_mlp": 1.08918118, + "epoch": 0.10124755749286037, + "flos": 20704885688160.0, + "grad_norm": 2.020827833847611, + "language_loss": 0.74878561, + "learning_rate": 3.946987510376624e-06, + "loss": 0.77825016, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 2.8154594898223877 + }, + { + "auxiliary_loss_clip": 0.01778776, + "auxiliary_loss_mlp": 0.01313789, + "balance_loss_clip": 1.41532016, + "balance_loss_mlp": 1.10703278, + "epoch": 0.10130768074552833, + "flos": 56116372232160.0, + "grad_norm": 0.7701373890097193, + "language_loss": 0.6105653, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.641491, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 3.4662249088287354 + }, + { + "auxiliary_loss_clip": 0.01575752, + "auxiliary_loss_mlp": 0.01361128, + "balance_loss_clip": 1.2166338, + "balance_loss_mlp": 1.09104788, + "epoch": 0.1013678039981963, + "flos": 33405218919360.0, + "grad_norm": 2.5519495572836433, + "language_loss": 0.61705858, + "learning_rate": 3.946809212358516e-06, + "loss": 0.64642739, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 2.941783905029297 + }, + { + "auxiliary_loss_clip": 0.01583332, + "auxiliary_loss_mlp": 0.01347605, + "balance_loss_clip": 1.22591233, + "balance_loss_mlp": 1.07695246, + "epoch": 0.10142792725086427, + "flos": 31908065722560.0, + "grad_norm": 4.033947735271379, + "language_loss": 0.8140589, + "learning_rate": 3.946719952612972e-06, + "loss": 0.84336829, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 4.377343654632568 + }, + { + "auxiliary_loss_clip": 0.01574005, + "auxiliary_loss_mlp": 0.01363509, + "balance_loss_clip": 1.21598864, + "balance_loss_mlp": 1.08694339, + "epoch": 0.10148805050353224, + "flos": 28478453464800.0, + "grad_norm": 2.241679754600191, + "language_loss": 0.72453654, + "learning_rate": 3.94663061904761e-06, + "loss": 0.75391167, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 2.884146213531494 + }, + { + "auxiliary_loss_clip": 0.01572072, + "auxiliary_loss_mlp": 0.01346429, + "balance_loss_clip": 1.21418607, + "balance_loss_mlp": 1.07310557, + "epoch": 0.1015481737562002, + "flos": 25150869050400.0, + "grad_norm": 2.638511746375993, + "language_loss": 0.87040436, + "learning_rate": 3.94654121166582e-06, + "loss": 0.89958942, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 2.80043888092041 + }, + { + "auxiliary_loss_clip": 0.01579948, + "auxiliary_loss_mlp": 0.01336391, + "balance_loss_clip": 1.22043777, + "balance_loss_mlp": 1.06077862, + "epoch": 0.10160829700886818, + "flos": 30885398543520.0, + "grad_norm": 2.9771047531108503, + "language_loss": 0.8817994, + "learning_rate": 3.946451730470993e-06, + "loss": 0.91096276, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.8598835468292236 + }, + { + "auxiliary_loss_clip": 0.01565369, + "auxiliary_loss_mlp": 0.01344236, + "balance_loss_clip": 1.20749402, + "balance_loss_mlp": 1.06557274, + "epoch": 0.10166842026153615, + "flos": 20414087570880.0, + "grad_norm": 2.2419702149559084, + "language_loss": 0.83663112, + "learning_rate": 3.946362175466521e-06, + "loss": 0.86572719, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 2.800828456878662 + }, + { + "auxiliary_loss_clip": 0.01570513, + "auxiliary_loss_mlp": 0.01326382, + "balance_loss_clip": 1.21209002, + "balance_loss_mlp": 1.04886246, + "epoch": 0.10172854351420411, + "flos": 33479938123200.0, + "grad_norm": 2.702396764351976, + "language_loss": 0.66970325, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69867218, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 2.923520088195801 + }, + { + "auxiliary_loss_clip": 0.01560585, + "auxiliary_loss_mlp": 0.01344894, + "balance_loss_clip": 1.20255482, + "balance_loss_mlp": 1.0690912, + "epoch": 0.1017886667668721, + "flos": 23552598286080.0, + "grad_norm": 3.1707588484139166, + "language_loss": 0.76117551, + "learning_rate": 3.94618284404223e-06, + "loss": 0.79023027, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 4.3418869972229 + }, + { + "auxiliary_loss_clip": 0.01560162, + "auxiliary_loss_mlp": 0.0134301, + "balance_loss_clip": 1.20278466, + "balance_loss_mlp": 1.06587243, + "epoch": 0.10184879001954006, + "flos": 23298401207520.0, + "grad_norm": 2.817405036872847, + "language_loss": 0.87538999, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.90442169, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 4.314241647720337 + }, + { + "auxiliary_loss_clip": 0.01558579, + "auxiliary_loss_mlp": 0.01346137, + "balance_loss_clip": 1.20029712, + "balance_loss_mlp": 1.06613815, + "epoch": 0.10190891327220802, + "flos": 18335110570560.0, + "grad_norm": 7.762309513762596, + "language_loss": 0.79212213, + "learning_rate": 3.946003217420147e-06, + "loss": 0.82116926, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 4.344055414199829 + }, + { + "auxiliary_loss_clip": 0.01553761, + "auxiliary_loss_mlp": 0.0134553, + "balance_loss_clip": 1.19606268, + "balance_loss_mlp": 1.07010865, + "epoch": 0.10196903652487599, + "flos": 26467596168480.0, + "grad_norm": 1.9394544640634108, + "language_loss": 0.86810499, + "learning_rate": 3.945913293418447e-06, + "loss": 0.89709789, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.956965446472168 + }, + { + "auxiliary_loss_clip": 0.01572741, + "auxiliary_loss_mlp": 0.01338923, + "balance_loss_clip": 1.21609855, + "balance_loss_mlp": 1.06350183, + "epoch": 0.10202915977754397, + "flos": 21871567548000.0, + "grad_norm": 3.893377040659145, + "language_loss": 0.82176298, + "learning_rate": 3.945823295627519e-06, + "loss": 0.85087961, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 2.8679354190826416 + }, + { + "auxiliary_loss_clip": 0.0156757, + "auxiliary_loss_mlp": 0.01345016, + "balance_loss_clip": 1.21129489, + "balance_loss_mlp": 1.06825924, + "epoch": 0.10208928303021193, + "flos": 22311955785600.0, + "grad_norm": 4.12169833386099, + "language_loss": 0.8116653, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.84079117, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.8825788497924805 + }, + { + "auxiliary_loss_clip": 0.01569465, + "auxiliary_loss_mlp": 0.01335497, + "balance_loss_clip": 1.21361756, + "balance_loss_mlp": 1.05473495, + "epoch": 0.1021494062828799, + "flos": 22127661034560.0, + "grad_norm": 2.992406555963885, + "language_loss": 0.76606536, + "learning_rate": 3.945643078691637e-06, + "loss": 0.79511499, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.916879415512085 + }, + { + "auxiliary_loss_clip": 0.01574133, + "auxiliary_loss_mlp": 0.01341265, + "balance_loss_clip": 1.21766675, + "balance_loss_mlp": 1.06031275, + "epoch": 0.10220952953554788, + "flos": 19648803435840.0, + "grad_norm": 2.3646252800168157, + "language_loss": 0.80540991, + "learning_rate": 3.945552859553516e-06, + "loss": 0.83456391, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.8388595581054688 + }, + { + "auxiliary_loss_clip": 0.015616, + "auxiliary_loss_mlp": 0.01336964, + "balance_loss_clip": 1.20706654, + "balance_loss_mlp": 1.05963588, + "epoch": 0.10226965278821584, + "flos": 29789794784160.0, + "grad_norm": 1.965471783657329, + "language_loss": 0.76843053, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79741621, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 2.8569588661193848 + }, + { + "auxiliary_loss_clip": 0.01566142, + "auxiliary_loss_mlp": 0.01347111, + "balance_loss_clip": 1.21150255, + "balance_loss_mlp": 1.07397866, + "epoch": 0.10232977604088381, + "flos": 27019304648640.0, + "grad_norm": 2.3828397621818063, + "language_loss": 0.77763081, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80676335, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.7637696266174316 + }, + { + "auxiliary_loss_clip": 0.01574383, + "auxiliary_loss_mlp": 0.01348142, + "balance_loss_clip": 1.2182281, + "balance_loss_mlp": 1.07443762, + "epoch": 0.10238989929355179, + "flos": 20779642820160.0, + "grad_norm": 2.259536909577478, + "language_loss": 0.94295335, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97217858, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 2.730384588241577 + }, + { + "auxiliary_loss_clip": 0.01714929, + "auxiliary_loss_mlp": 0.01250305, + "balance_loss_clip": 1.35924435, + "balance_loss_mlp": 1.03057861, + "epoch": 0.10245002254621975, + "flos": 57704402393280.0, + "grad_norm": 0.9108009002079163, + "language_loss": 0.55012292, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57977527, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.275362730026245 + }, + { + "auxiliary_loss_clip": 0.01562049, + "auxiliary_loss_mlp": 0.01330462, + "balance_loss_clip": 1.20758224, + "balance_loss_mlp": 1.04912853, + "epoch": 0.10251014579888772, + "flos": 16802001113760.0, + "grad_norm": 2.1450157889860417, + "language_loss": 0.84534502, + "learning_rate": 3.945100657298039e-06, + "loss": 0.87427014, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.7527077198028564 + }, + { + "auxiliary_loss_clip": 0.01699944, + "auxiliary_loss_mlp": 0.01248825, + "balance_loss_clip": 1.34524751, + "balance_loss_mlp": 1.03062439, + "epoch": 0.1025702690515557, + "flos": 68571877004640.0, + "grad_norm": 0.8227419510219834, + "language_loss": 0.60405678, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.63354445, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.3137705326080322 + }, + { + "auxiliary_loss_clip": 0.01560585, + "auxiliary_loss_mlp": 0.01334443, + "balance_loss_clip": 1.20586026, + "balance_loss_mlp": 1.06283617, + "epoch": 0.10263039230422366, + "flos": 14868404208000.0, + "grad_norm": 2.6110793733327715, + "language_loss": 0.86613071, + "learning_rate": 3.94491926006294e-06, + "loss": 0.89508092, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.7769157886505127 + }, + { + "auxiliary_loss_clip": 0.01556045, + "auxiliary_loss_mlp": 0.01352818, + "balance_loss_clip": 1.20166337, + "balance_loss_mlp": 1.0836916, + "epoch": 0.10269051555689163, + "flos": 25339942749600.0, + "grad_norm": 1.642661254201688, + "language_loss": 0.73422432, + "learning_rate": 3.944828450816369e-06, + "loss": 0.763313, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.8674519062042236 + }, + { + "auxiliary_loss_clip": 0.01557524, + "auxiliary_loss_mlp": 0.01340645, + "balance_loss_clip": 1.20268285, + "balance_loss_mlp": 1.0701828, + "epoch": 0.10275063880955959, + "flos": 21070934003520.0, + "grad_norm": 1.9231545796982723, + "language_loss": 0.90901887, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93800056, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.7838566303253174 + }, + { + "auxiliary_loss_clip": 0.01561737, + "auxiliary_loss_mlp": 0.01355047, + "balance_loss_clip": 1.20672679, + "balance_loss_mlp": 1.08897209, + "epoch": 0.10281076206222757, + "flos": 30369077400960.0, + "grad_norm": 2.9271659132817165, + "language_loss": 0.88572037, + "learning_rate": 3.944646611082406e-06, + "loss": 0.91488826, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.8158106803894043 + }, + { + "auxiliary_loss_clip": 0.01562509, + "auxiliary_loss_mlp": 0.01341966, + "balance_loss_clip": 1.20808136, + "balance_loss_mlp": 1.07112193, + "epoch": 0.10287088531489554, + "flos": 22420469344320.0, + "grad_norm": 2.2756561672312863, + "language_loss": 0.796471, + "learning_rate": 3.944555580601908e-06, + "loss": 0.82551569, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.8697738647460938 + }, + { + "auxiliary_loss_clip": 0.01558904, + "auxiliary_loss_mlp": 0.01362205, + "balance_loss_clip": 1.20415139, + "balance_loss_mlp": 1.09193397, + "epoch": 0.1029310085675635, + "flos": 25118174612160.0, + "grad_norm": 3.3580524060439485, + "language_loss": 0.73732096, + "learning_rate": 3.944464476383668e-06, + "loss": 0.76653206, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.8686680793762207 + }, + { + "auxiliary_loss_clip": 0.01552075, + "auxiliary_loss_mlp": 0.01354553, + "balance_loss_clip": 1.19754791, + "balance_loss_mlp": 1.08790612, + "epoch": 0.10299113182023148, + "flos": 19867726961280.0, + "grad_norm": 3.674111033965456, + "language_loss": 0.87261206, + "learning_rate": 3.94437329843114e-06, + "loss": 0.90167832, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.8309390544891357 + }, + { + "auxiliary_loss_clip": 0.01551547, + "auxiliary_loss_mlp": 0.01327754, + "balance_loss_clip": 1.19673061, + "balance_loss_mlp": 1.05652928, + "epoch": 0.10305125507289944, + "flos": 20449626621120.0, + "grad_norm": 2.7700912531927466, + "language_loss": 0.72738564, + "learning_rate": 3.944282046747782e-06, + "loss": 0.75617868, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.865410327911377 + }, + { + "auxiliary_loss_clip": 0.01556547, + "auxiliary_loss_mlp": 0.01352171, + "balance_loss_clip": 1.20222175, + "balance_loss_mlp": 1.07827568, + "epoch": 0.10311137832556741, + "flos": 26253224022240.0, + "grad_norm": 1.9556362312807605, + "language_loss": 0.90916693, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93825406, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 2.9135172367095947 + }, + { + "auxiliary_loss_clip": 0.01555157, + "auxiliary_loss_mlp": 0.01335774, + "balance_loss_clip": 1.19933021, + "balance_loss_mlp": 1.06206954, + "epoch": 0.10317150157823539, + "flos": 35301873432960.0, + "grad_norm": 1.9313353708840229, + "language_loss": 0.75717998, + "learning_rate": 3.944099322202418e-06, + "loss": 0.7860893, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.9176876544952393 + }, + { + "auxiliary_loss_clip": 0.01560728, + "auxiliary_loss_mlp": 0.01343579, + "balance_loss_clip": 1.20609498, + "balance_loss_mlp": 1.07063711, + "epoch": 0.10323162483090335, + "flos": 25742364534720.0, + "grad_norm": 3.8230358057895533, + "language_loss": 0.85220492, + "learning_rate": 3.944007849347342e-06, + "loss": 0.881248, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.788163900375366 + }, + { + "auxiliary_loss_clip": 0.01557416, + "auxiliary_loss_mlp": 0.0134319, + "balance_loss_clip": 1.20307004, + "balance_loss_mlp": 1.06700599, + "epoch": 0.10329174808357132, + "flos": 16291748476800.0, + "grad_norm": 2.9963229386054673, + "language_loss": 0.83221918, + "learning_rate": 3.943916302775292e-06, + "loss": 0.86122525, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.7767081260681152 + }, + { + "auxiliary_loss_clip": 0.01553311, + "auxiliary_loss_mlp": 0.01334791, + "balance_loss_clip": 1.19857633, + "balance_loss_mlp": 1.06070518, + "epoch": 0.10335187133623928, + "flos": 36690285502080.0, + "grad_norm": 2.089773347506607, + "language_loss": 0.73442954, + "learning_rate": 3.943824682489742e-06, + "loss": 0.76331055, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 2.9039652347564697 + }, + { + "auxiliary_loss_clip": 0.01557318, + "auxiliary_loss_mlp": 0.01338484, + "balance_loss_clip": 1.20426822, + "balance_loss_mlp": 1.06458914, + "epoch": 0.10341199458890726, + "flos": 14977410832800.0, + "grad_norm": 4.027186989635407, + "language_loss": 0.92793548, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.95689344, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 2.7742538452148438 + }, + { + "auxiliary_loss_clip": 0.01556486, + "auxiliary_loss_mlp": 0.01341606, + "balance_loss_clip": 1.20210457, + "balance_loss_mlp": 1.06885517, + "epoch": 0.10347211784157523, + "flos": 21033650257920.0, + "grad_norm": 2.3415756059292616, + "language_loss": 0.79686922, + "learning_rate": 3.943641220792039e-06, + "loss": 0.82585013, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 2.8104472160339355 + }, + { + "auxiliary_loss_clip": 0.01552816, + "auxiliary_loss_mlp": 0.01337766, + "balance_loss_clip": 1.19777369, + "balance_loss_mlp": 1.06024718, + "epoch": 0.1035322410942432, + "flos": 19794145602240.0, + "grad_norm": 2.057195942347403, + "language_loss": 0.81064177, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.83954763, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 2.7744905948638916 + }, + { + "auxiliary_loss_clip": 0.01656486, + "auxiliary_loss_mlp": 0.01252472, + "balance_loss_clip": 1.30669212, + "balance_loss_mlp": 1.04495239, + "epoch": 0.10359236434691117, + "flos": 52704624502080.0, + "grad_norm": 0.9559477566241847, + "language_loss": 0.67152387, + "learning_rate": 3.943457464282059e-06, + "loss": 0.70061344, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 3.114020824432373 + }, + { + "auxiliary_loss_clip": 0.01556841, + "auxiliary_loss_mlp": 0.01346973, + "balance_loss_clip": 1.20193291, + "balance_loss_mlp": 1.06792831, + "epoch": 0.10365248759957914, + "flos": 18407364444000.0, + "grad_norm": 3.340694912863031, + "language_loss": 0.77668059, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80571872, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.7390084266662598 + }, + { + "auxiliary_loss_clip": 0.0156313, + "auxiliary_loss_mlp": 0.01351176, + "balance_loss_clip": 1.20891416, + "balance_loss_mlp": 1.06812525, + "epoch": 0.1037126108522471, + "flos": 47556925693920.0, + "grad_norm": 3.2683298239719196, + "language_loss": 0.7517063, + "learning_rate": 3.943273412987676e-06, + "loss": 0.78084934, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 4.620249032974243 + }, + { + "auxiliary_loss_clip": 0.01564991, + "auxiliary_loss_mlp": 0.01331633, + "balance_loss_clip": 1.20992088, + "balance_loss_mlp": 1.05316043, + "epoch": 0.10377273410491508, + "flos": 22818681103680.0, + "grad_norm": 3.7616761804404852, + "language_loss": 0.7483775, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77734375, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 2.827608346939087 + }, + { + "auxiliary_loss_clip": 0.01565764, + "auxiliary_loss_mlp": 0.01345054, + "balance_loss_clip": 1.21053457, + "balance_loss_mlp": 1.05838001, + "epoch": 0.10383285735758305, + "flos": 26140993503840.0, + "grad_norm": 2.782057436072128, + "language_loss": 0.73480457, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76391274, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 2.830111265182495 + }, + { + "auxiliary_loss_clip": 0.01564249, + "auxiliary_loss_mlp": 0.01353811, + "balance_loss_clip": 1.20793521, + "balance_loss_mlp": 1.07152355, + "epoch": 0.10389298061025101, + "flos": 17093671578720.0, + "grad_norm": 2.4478974968029004, + "language_loss": 0.84640545, + "learning_rate": 3.942996783386422e-06, + "loss": 0.87558603, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.8299098014831543 + }, + { + "auxiliary_loss_clip": 0.01574293, + "auxiliary_loss_mlp": 0.01361168, + "balance_loss_clip": 1.21759057, + "balance_loss_mlp": 1.08193207, + "epoch": 0.10395310386291898, + "flos": 20778580831680.0, + "grad_norm": 3.5750835085302994, + "language_loss": 0.70641047, + "learning_rate": 3.942904426157406e-06, + "loss": 0.7357651, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.862499713897705 + }, + { + "auxiliary_loss_clip": 0.0157025, + "auxiliary_loss_mlp": 0.01339979, + "balance_loss_clip": 1.21570182, + "balance_loss_mlp": 1.05921757, + "epoch": 0.10401322711558696, + "flos": 12822045789600.0, + "grad_norm": 2.6772253528726573, + "language_loss": 0.81911266, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.84821498, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 2.7908120155334473 + }, + { + "auxiliary_loss_clip": 0.01571236, + "auxiliary_loss_mlp": 0.01329824, + "balance_loss_clip": 1.21508098, + "balance_loss_mlp": 1.05096984, + "epoch": 0.10407335036825492, + "flos": 23186891324160.0, + "grad_norm": 1.9037581243372683, + "language_loss": 0.75788689, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78689754, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 4.290908098220825 + }, + { + "auxiliary_loss_clip": 0.01574546, + "auxiliary_loss_mlp": 0.01338572, + "balance_loss_clip": 1.21964824, + "balance_loss_mlp": 1.06181586, + "epoch": 0.10413347362092289, + "flos": 26106592298400.0, + "grad_norm": 2.39106801552466, + "language_loss": 0.82892132, + "learning_rate": 3.9426269124336e-06, + "loss": 0.85805249, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 4.37512469291687 + }, + { + "auxiliary_loss_clip": 0.01574749, + "auxiliary_loss_mlp": 0.01332307, + "balance_loss_clip": 1.21953773, + "balance_loss_mlp": 1.05574179, + "epoch": 0.10419359687359087, + "flos": 12642719627520.0, + "grad_norm": 3.6174561389746245, + "language_loss": 0.83703661, + "learning_rate": 3.942534260525104e-06, + "loss": 0.86610723, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 2.788848400115967 + }, + { + "auxiliary_loss_clip": 0.01573662, + "auxiliary_loss_mlp": 0.01340016, + "balance_loss_clip": 1.21837568, + "balance_loss_mlp": 1.05791879, + "epoch": 0.10425372012625883, + "flos": 12126436413120.0, + "grad_norm": 2.796636601450761, + "language_loss": 0.77113885, + "learning_rate": 3.942441534955514e-06, + "loss": 0.80027562, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 4.190448045730591 + }, + { + "auxiliary_loss_clip": 0.01573276, + "auxiliary_loss_mlp": 0.01335643, + "balance_loss_clip": 1.21788716, + "balance_loss_mlp": 1.06155741, + "epoch": 0.1043138433789268, + "flos": 25339904821440.0, + "grad_norm": 1.8752491382125904, + "language_loss": 0.74858034, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77766955, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.863598585128784 + }, + { + "auxiliary_loss_clip": 0.01582856, + "auxiliary_loss_mlp": 0.0133332, + "balance_loss_clip": 1.22754872, + "balance_loss_mlp": 1.06247652, + "epoch": 0.10437396663159478, + "flos": 29169625246560.0, + "grad_norm": 2.299753301574991, + "language_loss": 0.78408611, + "learning_rate": 3.94225586284712e-06, + "loss": 0.81324786, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 2.864110231399536 + }, + { + "auxiliary_loss_clip": 0.01585347, + "auxiliary_loss_mlp": 0.01341098, + "balance_loss_clip": 1.23007846, + "balance_loss_mlp": 1.07330632, + "epoch": 0.10443408988426274, + "flos": 25083204484320.0, + "grad_norm": 2.01300144218436, + "language_loss": 0.70632154, + "learning_rate": 3.942162916315356e-06, + "loss": 0.73558605, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.852736473083496 + }, + { + "auxiliary_loss_clip": 0.0158448, + "auxiliary_loss_mlp": 0.01339086, + "balance_loss_clip": 1.23104429, + "balance_loss_mlp": 1.06385517, + "epoch": 0.1044942131369307, + "flos": 26762035389120.0, + "grad_norm": 2.079400640321626, + "language_loss": 0.81755906, + "learning_rate": 3.942069896136581e-06, + "loss": 0.84679478, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 2.8451197147369385 + }, + { + "auxiliary_loss_clip": 0.01595688, + "auxiliary_loss_mlp": 0.0135546, + "balance_loss_clip": 1.24288154, + "balance_loss_mlp": 1.08347178, + "epoch": 0.10455433638959867, + "flos": 18444572333280.0, + "grad_norm": 2.818224400656607, + "language_loss": 0.74965286, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77916431, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 2.8330554962158203 + }, + { + "auxiliary_loss_clip": 0.01595148, + "auxiliary_loss_mlp": 0.01339823, + "balance_loss_clip": 1.24275875, + "balance_loss_mlp": 1.06993341, + "epoch": 0.10461445964226665, + "flos": 23221064960640.0, + "grad_norm": 2.934536486485271, + "language_loss": 0.77460825, + "learning_rate": 3.941883634852104e-06, + "loss": 0.803958, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.860574722290039 + }, + { + "auxiliary_loss_clip": 0.01596707, + "auxiliary_loss_mlp": 0.01348732, + "balance_loss_clip": 1.24572659, + "balance_loss_mlp": 1.08170319, + "epoch": 0.10467458289493461, + "flos": 24347125396800.0, + "grad_norm": 4.7330039677087425, + "language_loss": 0.85683835, + "learning_rate": 3.941790393753467e-06, + "loss": 0.8862927, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 2.8274765014648438 + }, + { + "auxiliary_loss_clip": 0.01591947, + "auxiliary_loss_mlp": 0.01362442, + "balance_loss_clip": 1.24056041, + "balance_loss_mlp": 1.0927428, + "epoch": 0.10473470614760258, + "flos": 21290085097920.0, + "grad_norm": 2.6755611239272024, + "language_loss": 0.75237125, + "learning_rate": 3.941697079021942e-06, + "loss": 0.78191519, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 2.8378379344940186 + }, + { + "auxiliary_loss_clip": 0.01595354, + "auxiliary_loss_mlp": 0.0134869, + "balance_loss_clip": 1.24356174, + "balance_loss_mlp": 1.075176, + "epoch": 0.10479482940027056, + "flos": 21689245061280.0, + "grad_norm": 2.1571799795488564, + "language_loss": 0.8785997, + "learning_rate": 3.94160369066107e-06, + "loss": 0.90804011, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 2.8573622703552246 + }, + { + "auxiliary_loss_clip": 0.01594265, + "auxiliary_loss_mlp": 0.01338746, + "balance_loss_clip": 1.24227118, + "balance_loss_mlp": 1.06923795, + "epoch": 0.10485495265293852, + "flos": 21575497416480.0, + "grad_norm": 3.8072738305613747, + "language_loss": 0.76400745, + "learning_rate": 3.941510228674391e-06, + "loss": 0.79333758, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.887737989425659 + }, + { + "auxiliary_loss_clip": 0.01592553, + "auxiliary_loss_mlp": 0.01353755, + "balance_loss_clip": 1.24156547, + "balance_loss_mlp": 1.08100462, + "epoch": 0.10491507590560649, + "flos": 37964987854560.0, + "grad_norm": 2.16591169711968, + "language_loss": 0.79678881, + "learning_rate": 3.941416693065451e-06, + "loss": 0.82625186, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.928269386291504 + }, + { + "auxiliary_loss_clip": 0.01592862, + "auxiliary_loss_mlp": 0.01343424, + "balance_loss_clip": 1.24044371, + "balance_loss_mlp": 1.07086349, + "epoch": 0.10497519915827447, + "flos": 26398717901280.0, + "grad_norm": 2.4738782988606935, + "language_loss": 0.83372092, + "learning_rate": 3.941323083837794e-06, + "loss": 0.86308378, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.9217281341552734 + }, + { + "auxiliary_loss_clip": 0.015856, + "auxiliary_loss_mlp": 0.01327788, + "balance_loss_clip": 1.23453462, + "balance_loss_mlp": 1.05179453, + "epoch": 0.10503532241094243, + "flos": 40665423949920.0, + "grad_norm": 1.7734342304003776, + "language_loss": 0.70212996, + "learning_rate": 3.941229400994971e-06, + "loss": 0.73126382, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 2.9659669399261475 + }, + { + "auxiliary_loss_clip": 0.01591749, + "auxiliary_loss_mlp": 0.01352915, + "balance_loss_clip": 1.24057412, + "balance_loss_mlp": 1.0778749, + "epoch": 0.1050954456636104, + "flos": 29792032545600.0, + "grad_norm": 6.0557756586349765, + "language_loss": 0.8453598, + "learning_rate": 3.941135644540535e-06, + "loss": 0.8748064, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.8443729877471924 + }, + { + "auxiliary_loss_clip": 0.01584879, + "auxiliary_loss_mlp": 0.01325982, + "balance_loss_clip": 1.23267651, + "balance_loss_mlp": 1.05189633, + "epoch": 0.10515556891627838, + "flos": 23950885901760.0, + "grad_norm": 2.016528813385208, + "language_loss": 0.71561128, + "learning_rate": 3.941041814478041e-06, + "loss": 0.74471986, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.864917755126953 + }, + { + "auxiliary_loss_clip": 0.01597025, + "auxiliary_loss_mlp": 0.01338482, + "balance_loss_clip": 1.24553931, + "balance_loss_mlp": 1.06897402, + "epoch": 0.10521569216894634, + "flos": 18261567139680.0, + "grad_norm": 2.5095755924806444, + "language_loss": 0.82055724, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84991229, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.8114285469055176 + }, + { + "auxiliary_loss_clip": 0.01584934, + "auxiliary_loss_mlp": 0.01341802, + "balance_loss_clip": 1.23285174, + "balance_loss_mlp": 1.06580865, + "epoch": 0.10527581542161431, + "flos": 15632626354560.0, + "grad_norm": 3.062061809446935, + "language_loss": 0.92458409, + "learning_rate": 3.940853933543114e-06, + "loss": 0.95385146, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.815324544906616 + }, + { + "auxiliary_loss_clip": 0.01584127, + "auxiliary_loss_mlp": 0.01330266, + "balance_loss_clip": 1.23254943, + "balance_loss_mlp": 1.05865955, + "epoch": 0.10533593867428227, + "flos": 18298547460000.0, + "grad_norm": 2.419078346364378, + "language_loss": 0.79480243, + "learning_rate": 3.940759882677805e-06, + "loss": 0.8239463, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.8496530055999756 + }, + { + "auxiliary_loss_clip": 0.01583769, + "auxiliary_loss_mlp": 0.01340306, + "balance_loss_clip": 1.23162329, + "balance_loss_mlp": 1.06507516, + "epoch": 0.10539606192695025, + "flos": 29025800206560.0, + "grad_norm": 1.996524243910427, + "language_loss": 0.76177537, + "learning_rate": 3.940665758218686e-06, + "loss": 0.79101616, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.879065990447998 + }, + { + "auxiliary_loss_clip": 0.01578024, + "auxiliary_loss_mlp": 0.01336693, + "balance_loss_clip": 1.22647095, + "balance_loss_mlp": 1.05974555, + "epoch": 0.10545618517961822, + "flos": 19971082290240.0, + "grad_norm": 2.4364451035275376, + "language_loss": 0.84033877, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86948591, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 2.880380392074585 + }, + { + "auxiliary_loss_clip": 0.01586043, + "auxiliary_loss_mlp": 0.01341252, + "balance_loss_clip": 1.23486876, + "balance_loss_mlp": 1.06449592, + "epoch": 0.10551630843228618, + "flos": 16145268465600.0, + "grad_norm": 5.497814790102726, + "language_loss": 0.69096488, + "learning_rate": 3.940477288533302e-06, + "loss": 0.72023785, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.7902450561523438 + }, + { + "auxiliary_loss_clip": 0.01580414, + "auxiliary_loss_mlp": 0.01340313, + "balance_loss_clip": 1.23006642, + "balance_loss_mlp": 1.06851578, + "epoch": 0.10557643168495416, + "flos": 23442340032000.0, + "grad_norm": 2.9144586027685184, + "language_loss": 0.77276891, + "learning_rate": 3.940382943314182e-06, + "loss": 0.8019762, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.8817758560180664 + }, + { + "auxiliary_loss_clip": 0.01577704, + "auxiliary_loss_mlp": 0.01324107, + "balance_loss_clip": 1.22692537, + "balance_loss_mlp": 1.05002117, + "epoch": 0.10563655493762213, + "flos": 21801210082560.0, + "grad_norm": 2.689959818169322, + "language_loss": 0.80140662, + "learning_rate": 3.940288524515547e-06, + "loss": 0.83042479, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.869067668914795 + }, + { + "auxiliary_loss_clip": 0.01582172, + "auxiliary_loss_mlp": 0.01346282, + "balance_loss_clip": 1.23168588, + "balance_loss_mlp": 1.07353151, + "epoch": 0.10569667819029009, + "flos": 53805045502080.0, + "grad_norm": 1.8573467945410567, + "language_loss": 0.79154485, + "learning_rate": 3.940194032140976e-06, + "loss": 0.82082933, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 3.0946168899536133 + }, + { + "auxiliary_loss_clip": 0.01575359, + "auxiliary_loss_mlp": 0.01334663, + "balance_loss_clip": 1.22334051, + "balance_loss_mlp": 1.05847907, + "epoch": 0.10575680144295807, + "flos": 22927498087680.0, + "grad_norm": 1.836329325773433, + "language_loss": 0.91927016, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94837034, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 2.9593753814697266 + }, + { + "auxiliary_loss_clip": 0.01581095, + "auxiliary_loss_mlp": 0.01326547, + "balance_loss_clip": 1.2297647, + "balance_loss_mlp": 1.04998136, + "epoch": 0.10581692469562604, + "flos": 14138393626080.0, + "grad_norm": 2.5534173943410243, + "language_loss": 0.7772671, + "learning_rate": 3.940004826678365e-06, + "loss": 0.80634356, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 2.7755861282348633 + }, + { + "auxiliary_loss_clip": 0.01584245, + "auxiliary_loss_mlp": 0.01331309, + "balance_loss_clip": 1.23200774, + "balance_loss_mlp": 1.05016565, + "epoch": 0.105877047948294, + "flos": 25961098419360.0, + "grad_norm": 2.3079347878633105, + "language_loss": 0.89478767, + "learning_rate": 3.939910113597498e-06, + "loss": 0.92394322, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.878876209259033 + }, + { + "auxiliary_loss_clip": 0.01573436, + "auxiliary_loss_mlp": 0.01339244, + "balance_loss_clip": 1.22201419, + "balance_loss_mlp": 1.06229711, + "epoch": 0.10593717120096197, + "flos": 30667423222080.0, + "grad_norm": 2.431745940439246, + "language_loss": 0.7849437, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.81407046, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 2.8540282249450684 + }, + { + "auxiliary_loss_clip": 0.01734868, + "auxiliary_loss_mlp": 0.01230102, + "balance_loss_clip": 1.38343561, + "balance_loss_mlp": 1.01571655, + "epoch": 0.10599729445362994, + "flos": 66444275374560.0, + "grad_norm": 0.7625034254797278, + "language_loss": 0.60505593, + "learning_rate": 3.939720466754602e-06, + "loss": 0.6347056, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 4.988255977630615 + }, + { + "auxiliary_loss_clip": 0.01577948, + "auxiliary_loss_mlp": 0.01323286, + "balance_loss_clip": 1.2263993, + "balance_loss_mlp": 1.04881811, + "epoch": 0.10605741770629791, + "flos": 23950279051200.0, + "grad_norm": 2.0906339284930824, + "language_loss": 0.80244589, + "learning_rate": 3.939625532999763e-06, + "loss": 0.83145821, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 2.8116469383239746 + }, + { + "auxiliary_loss_clip": 0.01585745, + "auxiliary_loss_mlp": 0.01322066, + "balance_loss_clip": 1.23339701, + "balance_loss_mlp": 1.05084109, + "epoch": 0.10611754095896588, + "flos": 19389372271200.0, + "grad_norm": 2.2920028697499846, + "language_loss": 0.80451232, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.83359039, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 2.7739224433898926 + }, + { + "auxiliary_loss_clip": 0.01590507, + "auxiliary_loss_mlp": 0.01343947, + "balance_loss_clip": 1.23863268, + "balance_loss_mlp": 1.07710862, + "epoch": 0.10617766421163385, + "flos": 22240005337440.0, + "grad_norm": 2.053264250216974, + "language_loss": 0.76975024, + "learning_rate": 3.939435444841306e-06, + "loss": 0.7990948, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 2.7928717136383057 + }, + { + "auxiliary_loss_clip": 0.01584822, + "auxiliary_loss_mlp": 0.01341795, + "balance_loss_clip": 1.23278201, + "balance_loss_mlp": 1.07285905, + "epoch": 0.10623778746430182, + "flos": 28407034010880.0, + "grad_norm": 1.868912738968606, + "language_loss": 0.77490336, + "learning_rate": 3.939340290444895e-06, + "loss": 0.80416954, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.8303678035736084 + }, + { + "auxiliary_loss_clip": 0.01711153, + "auxiliary_loss_mlp": 0.01235168, + "balance_loss_clip": 1.36079454, + "balance_loss_mlp": 1.02078247, + "epoch": 0.10629791071696978, + "flos": 64241385982560.0, + "grad_norm": 0.6973372904776938, + "language_loss": 0.57822096, + "learning_rate": 3.939245062508506e-06, + "loss": 0.60768414, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 3.457216501235962 + }, + { + "auxiliary_loss_clip": 0.01588219, + "auxiliary_loss_mlp": 0.01337701, + "balance_loss_clip": 1.2375598, + "balance_loss_mlp": 1.07391524, + "epoch": 0.10635803396963776, + "flos": 22749802836480.0, + "grad_norm": 1.4922205194902123, + "language_loss": 0.86674911, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89600831, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 4.368149518966675 + }, + { + "auxiliary_loss_clip": 0.01581929, + "auxiliary_loss_mlp": 0.01333749, + "balance_loss_clip": 1.22972953, + "balance_loss_mlp": 1.05985391, + "epoch": 0.10641815722230573, + "flos": 31398344079840.0, + "grad_norm": 2.8800449484467285, + "language_loss": 0.61846387, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64762068, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 4.399774551391602 + }, + { + "auxiliary_loss_clip": 0.01689891, + "auxiliary_loss_mlp": 0.01241333, + "balance_loss_clip": 1.33888221, + "balance_loss_mlp": 1.0284729, + "epoch": 0.1064782804749737, + "flos": 58557453383520.0, + "grad_norm": 0.8870900953623302, + "language_loss": 0.57044375, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.599756, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.1864876747131348 + }, + { + "auxiliary_loss_clip": 0.01578007, + "auxiliary_loss_mlp": 0.01347494, + "balance_loss_clip": 1.22691691, + "balance_loss_mlp": 1.07417154, + "epoch": 0.10653840372764166, + "flos": 23990103983520.0, + "grad_norm": 3.4353248362861395, + "language_loss": 0.88375914, + "learning_rate": 3.938863415435429e-06, + "loss": 0.91301411, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 4.303237199783325 + }, + { + "auxiliary_loss_clip": 0.01574541, + "auxiliary_loss_mlp": 0.01326538, + "balance_loss_clip": 1.2222991, + "balance_loss_mlp": 1.0534054, + "epoch": 0.10659852698030964, + "flos": 18296537267520.0, + "grad_norm": 2.91552751271966, + "language_loss": 0.76276559, + "learning_rate": 3.93876781985337e-06, + "loss": 0.79177636, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.8344838619232178 + }, + { + "auxiliary_loss_clip": 0.0157873, + "auxiliary_loss_mlp": 0.01347666, + "balance_loss_clip": 1.2280432, + "balance_loss_mlp": 1.07338977, + "epoch": 0.1066586502329776, + "flos": 32163324789600.0, + "grad_norm": 2.317343817547319, + "language_loss": 0.82938194, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85864586, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 2.9146735668182373 + }, + { + "auxiliary_loss_clip": 0.01579786, + "auxiliary_loss_mlp": 0.01334691, + "balance_loss_clip": 1.22932446, + "balance_loss_mlp": 1.06022382, + "epoch": 0.10671877348564557, + "flos": 17787081121920.0, + "grad_norm": 2.7543128899179075, + "language_loss": 0.7693066, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.79845136, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.714492082595825 + }, + { + "auxiliary_loss_clip": 0.01649732, + "auxiliary_loss_mlp": 0.01238357, + "balance_loss_clip": 1.3006022, + "balance_loss_mlp": 1.02778625, + "epoch": 0.10677889673831355, + "flos": 63517178409120.0, + "grad_norm": 0.8245308394956761, + "language_loss": 0.57384652, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.60272741, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.332024335861206 + }, + { + "auxiliary_loss_clip": 0.01586693, + "auxiliary_loss_mlp": 0.01346633, + "balance_loss_clip": 1.23706579, + "balance_loss_mlp": 1.07445455, + "epoch": 0.10683901999098151, + "flos": 22019868110880.0, + "grad_norm": 2.1999481440680793, + "language_loss": 0.83369952, + "learning_rate": 3.938384702378727e-06, + "loss": 0.86303282, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 2.7397255897521973 + }, + { + "auxiliary_loss_clip": 0.0157917, + "auxiliary_loss_mlp": 0.01334446, + "balance_loss_clip": 1.22929204, + "balance_loss_mlp": 1.0580709, + "epoch": 0.10689914324364948, + "flos": 25045124247360.0, + "grad_norm": 2.216176244186399, + "language_loss": 0.8757863, + "learning_rate": 3.938288739241625e-06, + "loss": 0.90492243, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.7211263179779053 + }, + { + "auxiliary_loss_clip": 0.01577892, + "auxiliary_loss_mlp": 0.01333852, + "balance_loss_clip": 1.22756863, + "balance_loss_mlp": 1.04946685, + "epoch": 0.10695926649631746, + "flos": 16436976858720.0, + "grad_norm": 2.17177997806712, + "language_loss": 0.84451389, + "learning_rate": 3.938192702604417e-06, + "loss": 0.87363142, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.642159938812256 + }, + { + "auxiliary_loss_clip": 0.0157544, + "auxiliary_loss_mlp": 0.01334161, + "balance_loss_clip": 1.22387671, + "balance_loss_mlp": 1.06198239, + "epoch": 0.10701938974898542, + "flos": 16980910066080.0, + "grad_norm": 2.2001135212956298, + "language_loss": 0.67135376, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.70044976, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.7718145847320557 + }, + { + "auxiliary_loss_clip": 0.01579063, + "auxiliary_loss_mlp": 0.01331422, + "balance_loss_clip": 1.2275424, + "balance_loss_mlp": 1.05809879, + "epoch": 0.10707951300165339, + "flos": 15889933542240.0, + "grad_norm": 2.2550872552379575, + "language_loss": 0.91714543, + "learning_rate": 3.938000408844265e-06, + "loss": 0.94625032, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 2.8620352745056152 + }, + { + "auxiliary_loss_clip": 0.01569206, + "auxiliary_loss_mlp": 0.01342366, + "balance_loss_clip": 1.2183969, + "balance_loss_mlp": 1.06827974, + "epoch": 0.10713963625432135, + "flos": 14248955305440.0, + "grad_norm": 2.123946369063572, + "language_loss": 0.79218996, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.82130563, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.7110276222229004 + }, + { + "auxiliary_loss_clip": 0.0156763, + "auxiliary_loss_mlp": 0.013337, + "balance_loss_clip": 1.21552992, + "balance_loss_mlp": 1.05427301, + "epoch": 0.10719975950698933, + "flos": 16758269580960.0, + "grad_norm": 2.1805959132863357, + "language_loss": 0.79343432, + "learning_rate": 3.937807821127436e-06, + "loss": 0.8224476, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.7404470443725586 + }, + { + "auxiliary_loss_clip": 0.01571975, + "auxiliary_loss_mlp": 0.01336155, + "balance_loss_clip": 1.2198751, + "balance_loss_mlp": 1.05863571, + "epoch": 0.1072598827596573, + "flos": 22712822516160.0, + "grad_norm": 2.9646213892158544, + "language_loss": 0.86766732, + "learning_rate": 3.937711417044395e-06, + "loss": 0.89674854, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.76143479347229 + }, + { + "auxiliary_loss_clip": 0.01580444, + "auxiliary_loss_mlp": 0.01355228, + "balance_loss_clip": 1.2271533, + "balance_loss_mlp": 1.07980728, + "epoch": 0.10732000601232526, + "flos": 23260510611360.0, + "grad_norm": 3.1208257381873374, + "language_loss": 1.00941861, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03877532, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 2.779451608657837 + }, + { + "auxiliary_loss_clip": 0.01566616, + "auxiliary_loss_mlp": 0.01316389, + "balance_loss_clip": 1.21334684, + "balance_loss_mlp": 1.04459178, + "epoch": 0.10738012926499324, + "flos": 24209368862400.0, + "grad_norm": 1.6887731910136166, + "language_loss": 0.85265505, + "learning_rate": 3.937518388447339e-06, + "loss": 0.88148504, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.771214008331299 + }, + { + "auxiliary_loss_clip": 0.01567323, + "auxiliary_loss_mlp": 0.01326722, + "balance_loss_clip": 1.21472836, + "balance_loss_mlp": 1.0549252, + "epoch": 0.1074402525176612, + "flos": 20925174627360.0, + "grad_norm": 2.0315327147531437, + "language_loss": 0.78973103, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81867146, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.760310411453247 + }, + { + "auxiliary_loss_clip": 0.01567044, + "auxiliary_loss_mlp": 0.01350585, + "balance_loss_clip": 1.21442151, + "balance_loss_mlp": 1.08126712, + "epoch": 0.10750037577032917, + "flos": 16948860406560.0, + "grad_norm": 2.1658369421346633, + "language_loss": 0.82557631, + "learning_rate": 3.937325065966719e-06, + "loss": 0.85475266, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 2.7966866493225098 + }, + { + "auxiliary_loss_clip": 0.01571115, + "auxiliary_loss_mlp": 0.01337425, + "balance_loss_clip": 1.21834075, + "balance_loss_mlp": 1.06372011, + "epoch": 0.10756049902299715, + "flos": 20268631620000.0, + "grad_norm": 3.9039135682471127, + "language_loss": 0.78654146, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.81562686, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 2.7921810150146484 + }, + { + "auxiliary_loss_clip": 0.01568361, + "auxiliary_loss_mlp": 0.01346638, + "balance_loss_clip": 1.21512389, + "balance_loss_mlp": 1.07560396, + "epoch": 0.10762062227566511, + "flos": 23588782115040.0, + "grad_norm": 3.770164284580546, + "language_loss": 0.74953407, + "learning_rate": 3.937131449631859e-06, + "loss": 0.77868408, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.7857260704040527 + }, + { + "auxiliary_loss_clip": 0.0157423, + "auxiliary_loss_mlp": 0.01351904, + "balance_loss_clip": 1.22161746, + "balance_loss_mlp": 1.07552922, + "epoch": 0.10768074552833308, + "flos": 24312610406880.0, + "grad_norm": 2.6116279196614522, + "language_loss": 0.78971446, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.81897569, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.815187692642212 + }, + { + "auxiliary_loss_clip": 0.01568334, + "auxiliary_loss_mlp": 0.01339551, + "balance_loss_clip": 1.21538973, + "balance_loss_mlp": 1.06851649, + "epoch": 0.10774086878100106, + "flos": 25302241794240.0, + "grad_norm": 1.9225524494338988, + "language_loss": 0.7136364, + "learning_rate": 3.936937539472126e-06, + "loss": 0.7427153, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.8352484703063965 + }, + { + "auxiliary_loss_clip": 0.01581016, + "auxiliary_loss_mlp": 0.01346976, + "balance_loss_clip": 1.2291702, + "balance_loss_mlp": 1.07536936, + "epoch": 0.10780099203366902, + "flos": 22056203652480.0, + "grad_norm": 2.4226194367614107, + "language_loss": 0.76614225, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.7954222, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.716986894607544 + }, + { + "auxiliary_loss_clip": 0.01576746, + "auxiliary_loss_mlp": 0.01339991, + "balance_loss_clip": 1.22502613, + "balance_loss_mlp": 1.06895697, + "epoch": 0.10786111528633699, + "flos": 22749689052000.0, + "grad_norm": 1.72372060059682, + "language_loss": 0.85275084, + "learning_rate": 3.936743335516936e-06, + "loss": 0.88191819, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 2.847566604614258 + }, + { + "auxiliary_loss_clip": 0.01574587, + "auxiliary_loss_mlp": 0.01349649, + "balance_loss_clip": 1.22277713, + "balance_loss_mlp": 1.0789963, + "epoch": 0.10792123853900495, + "flos": 20853224179200.0, + "grad_norm": 2.235761217616962, + "language_loss": 0.75057507, + "learning_rate": 3.936646123375246e-06, + "loss": 0.7798174, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.7692019939422607 + }, + { + "auxiliary_loss_clip": 0.01573874, + "auxiliary_loss_mlp": 0.01346089, + "balance_loss_clip": 1.22139573, + "balance_loss_mlp": 1.07333827, + "epoch": 0.10798136179167293, + "flos": 17750631795840.0, + "grad_norm": 7.923634120690184, + "language_loss": 0.82211834, + "learning_rate": 3.936548837795741e-06, + "loss": 0.85131794, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 2.7679011821746826 + }, + { + "auxiliary_loss_clip": 0.01576906, + "auxiliary_loss_mlp": 0.01337653, + "balance_loss_clip": 1.22588325, + "balance_loss_mlp": 1.06089687, + "epoch": 0.1080414850443409, + "flos": 13591350309600.0, + "grad_norm": 2.4214678397685714, + "language_loss": 0.7394464, + "learning_rate": 3.936451478782111e-06, + "loss": 0.768592, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.718928337097168 + }, + { + "auxiliary_loss_clip": 0.01567942, + "auxiliary_loss_mlp": 0.0132208, + "balance_loss_clip": 1.21654499, + "balance_loss_mlp": 1.05409789, + "epoch": 0.10810160829700886, + "flos": 16255299150720.0, + "grad_norm": 2.6758229521730117, + "language_loss": 0.81580079, + "learning_rate": 3.936354046338046e-06, + "loss": 0.84470105, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 2.7336678504943848 + }, + { + "auxiliary_loss_clip": 0.01570627, + "auxiliary_loss_mlp": 0.01336315, + "balance_loss_clip": 1.21975982, + "balance_loss_mlp": 1.06108522, + "epoch": 0.10816173154967684, + "flos": 15159771247680.0, + "grad_norm": 2.6710916892185708, + "language_loss": 0.86439192, + "learning_rate": 3.936256540467242e-06, + "loss": 0.89346141, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 2.788341760635376 + }, + { + "auxiliary_loss_clip": 0.01572214, + "auxiliary_loss_mlp": 0.01332608, + "balance_loss_clip": 1.22219992, + "balance_loss_mlp": 1.06424379, + "epoch": 0.10822185480234481, + "flos": 17787270762720.0, + "grad_norm": 2.7714863937541687, + "language_loss": 0.77878827, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.80783653, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 2.7595033645629883 + }, + { + "auxiliary_loss_clip": 0.01574564, + "auxiliary_loss_mlp": 0.01324774, + "balance_loss_clip": 1.22461915, + "balance_loss_mlp": 1.05259514, + "epoch": 0.10828197805501277, + "flos": 25559017987680.0, + "grad_norm": 2.415812341661908, + "language_loss": 0.73309314, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.76208645, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 2.832484006881714 + }, + { + "auxiliary_loss_clip": 0.015717, + "auxiliary_loss_mlp": 0.01341431, + "balance_loss_clip": 1.22048688, + "balance_loss_mlp": 1.06772661, + "epoch": 0.10834210130768075, + "flos": 28986885550080.0, + "grad_norm": 2.930276413257177, + "language_loss": 0.66512978, + "learning_rate": 3.935963582331381e-06, + "loss": 0.69426107, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 4.458786487579346 + }, + { + "auxiliary_loss_clip": 0.01576011, + "auxiliary_loss_mlp": 0.01321056, + "balance_loss_clip": 1.22682178, + "balance_loss_mlp": 1.04525316, + "epoch": 0.10840222456034872, + "flos": 20266166289600.0, + "grad_norm": 1.7458516193280542, + "language_loss": 0.81702518, + "learning_rate": 3.935865782790621e-06, + "loss": 0.8459959, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 2.743513822555542 + }, + { + "auxiliary_loss_clip": 0.01582555, + "auxiliary_loss_mlp": 0.0133925, + "balance_loss_clip": 1.23368311, + "balance_loss_mlp": 1.06821597, + "epoch": 0.10846234781301668, + "flos": 19864996133760.0, + "grad_norm": 1.69246351858703, + "language_loss": 0.9109925, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.94021058, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.8564441204071045 + }, + { + "auxiliary_loss_clip": 0.01579951, + "auxiliary_loss_mlp": 0.0134489, + "balance_loss_clip": 1.23032677, + "balance_loss_mlp": 1.07500052, + "epoch": 0.10852247106568465, + "flos": 26471464840800.0, + "grad_norm": 2.9428980717470985, + "language_loss": 0.76627845, + "learning_rate": 3.935669963488139e-06, + "loss": 0.7955268, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 2.8185317516326904 + }, + { + "auxiliary_loss_clip": 0.01572554, + "auxiliary_loss_mlp": 0.01332351, + "balance_loss_clip": 1.22358656, + "balance_loss_mlp": 1.05940986, + "epoch": 0.10858259431835263, + "flos": 30084158148480.0, + "grad_norm": 1.9876732831741104, + "language_loss": 0.8609035, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88995254, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.8673555850982666 + }, + { + "auxiliary_loss_clip": 0.01564272, + "auxiliary_loss_mlp": 0.01330279, + "balance_loss_clip": 1.21622288, + "balance_loss_mlp": 1.05581164, + "epoch": 0.10864271757102059, + "flos": 19065234936960.0, + "grad_norm": 3.1549830345699883, + "language_loss": 0.81376034, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.84270585, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 2.7661232948303223 + }, + { + "auxiliary_loss_clip": 0.01573085, + "auxiliary_loss_mlp": 0.01329211, + "balance_loss_clip": 1.22556806, + "balance_loss_mlp": 1.05989337, + "epoch": 0.10870284082368856, + "flos": 24717232025280.0, + "grad_norm": 3.2974558160125977, + "language_loss": 0.78958881, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.8186118, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 5.869494438171387 + }, + { + "auxiliary_loss_clip": 0.01569416, + "auxiliary_loss_mlp": 0.01330736, + "balance_loss_clip": 1.22038138, + "balance_loss_mlp": 1.05588686, + "epoch": 0.10876296407635654, + "flos": 20629559633760.0, + "grad_norm": 3.6488916644239953, + "language_loss": 0.79164749, + "learning_rate": 3.935277444103342e-06, + "loss": 0.82064903, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 2.790485143661499 + }, + { + "auxiliary_loss_clip": 0.01576836, + "auxiliary_loss_mlp": 0.01342779, + "balance_loss_clip": 1.22892165, + "balance_loss_mlp": 1.07651329, + "epoch": 0.1088230873290245, + "flos": 21581983131840.0, + "grad_norm": 2.3382883199713067, + "language_loss": 0.85501003, + "learning_rate": 3.935179130783046e-06, + "loss": 0.88420618, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 4.318295478820801 + }, + { + "auxiliary_loss_clip": 0.01567295, + "auxiliary_loss_mlp": 0.01340418, + "balance_loss_clip": 1.21853948, + "balance_loss_mlp": 1.0678575, + "epoch": 0.10888321058169247, + "flos": 26471464840800.0, + "grad_norm": 1.868124410274934, + "language_loss": 0.6380344, + "learning_rate": 3.935080744080564e-06, + "loss": 0.66711152, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.8342642784118652 + }, + { + "auxiliary_loss_clip": 0.01568048, + "auxiliary_loss_mlp": 0.0132978, + "balance_loss_clip": 1.22007346, + "balance_loss_mlp": 1.05359578, + "epoch": 0.10894333383436045, + "flos": 25850991877920.0, + "grad_norm": 2.5400223705092713, + "language_loss": 0.74627101, + "learning_rate": 3.934982283999626e-06, + "loss": 0.77524936, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.7947022914886475 + }, + { + "auxiliary_loss_clip": 0.01567804, + "auxiliary_loss_mlp": 0.01325153, + "balance_loss_clip": 1.21986175, + "balance_loss_mlp": 1.05488133, + "epoch": 0.10900345708702841, + "flos": 19539152032320.0, + "grad_norm": 3.124431506926878, + "language_loss": 0.72652924, + "learning_rate": 3.934883750543966e-06, + "loss": 0.75545883, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 2.786893844604492 + }, + { + "auxiliary_loss_clip": 0.01568639, + "auxiliary_loss_mlp": 0.01315167, + "balance_loss_clip": 1.221223, + "balance_loss_mlp": 1.04203439, + "epoch": 0.10906358033969638, + "flos": 23625572794560.0, + "grad_norm": 2.3445047165815116, + "language_loss": 0.82715023, + "learning_rate": 3.93478514371732e-06, + "loss": 0.85598826, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.78090763092041 + }, + { + "auxiliary_loss_clip": 0.0156652, + "auxiliary_loss_mlp": 0.01335947, + "balance_loss_clip": 1.21822453, + "balance_loss_mlp": 1.06376839, + "epoch": 0.10912370359236434, + "flos": 21216845092320.0, + "grad_norm": 2.891640595568699, + "language_loss": 0.84465933, + "learning_rate": 3.934686463523429e-06, + "loss": 0.87368399, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.757918119430542 + }, + { + "auxiliary_loss_clip": 0.01575198, + "auxiliary_loss_mlp": 0.01343912, + "balance_loss_clip": 1.22771478, + "balance_loss_mlp": 1.0759294, + "epoch": 0.10918382684503232, + "flos": 13554900983520.0, + "grad_norm": 2.6993611974360374, + "language_loss": 0.7168256, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.74601668, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.7282540798187256 + }, + { + "auxiliary_loss_clip": 0.01567109, + "auxiliary_loss_mlp": 0.01330796, + "balance_loss_clip": 1.21970153, + "balance_loss_mlp": 1.05384946, + "epoch": 0.10924395009770028, + "flos": 27966797485920.0, + "grad_norm": 2.481922065593604, + "language_loss": 0.72680795, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75578701, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 2.789447069168091 + }, + { + "auxiliary_loss_clip": 0.01565633, + "auxiliary_loss_mlp": 0.01318824, + "balance_loss_clip": 1.21812773, + "balance_loss_mlp": 1.04740787, + "epoch": 0.10930407335036825, + "flos": 25596150020640.0, + "grad_norm": 2.241243923659301, + "language_loss": 0.67640328, + "learning_rate": 3.934389982775706e-06, + "loss": 0.70524788, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.8110783100128174 + }, + { + "auxiliary_loss_clip": 0.01570283, + "auxiliary_loss_mlp": 0.01321647, + "balance_loss_clip": 1.22252417, + "balance_loss_mlp": 1.04450941, + "epoch": 0.10936419660303623, + "flos": 18408198863520.0, + "grad_norm": 2.3447198175872623, + "language_loss": 0.73719335, + "learning_rate": 3.934291009150275e-06, + "loss": 0.76611269, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 2.8188424110412598 + }, + { + "auxiliary_loss_clip": 0.01575179, + "auxiliary_loss_mlp": 0.01347629, + "balance_loss_clip": 1.22767949, + "balance_loss_mlp": 1.07583165, + "epoch": 0.1094243198557042, + "flos": 23842220630400.0, + "grad_norm": 3.062664107648095, + "language_loss": 0.73992312, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76915121, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.884324312210083 + }, + { + "auxiliary_loss_clip": 0.01576331, + "auxiliary_loss_mlp": 0.01336754, + "balance_loss_clip": 1.22813559, + "balance_loss_mlp": 1.0672462, + "epoch": 0.10948444310837216, + "flos": 14645498225760.0, + "grad_norm": 3.6830548345221032, + "language_loss": 0.82763147, + "learning_rate": 3.934092841857642e-06, + "loss": 0.85676229, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 2.731285572052002 + }, + { + "auxiliary_loss_clip": 0.0156877, + "auxiliary_loss_mlp": 0.0132971, + "balance_loss_clip": 1.22122264, + "balance_loss_mlp": 1.05276263, + "epoch": 0.10954456636104014, + "flos": 27821303606880.0, + "grad_norm": 2.0989582522742487, + "language_loss": 0.76583582, + "learning_rate": 3.933993648197955e-06, + "loss": 0.79482067, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.8413994312286377 + }, + { + "auxiliary_loss_clip": 0.01566816, + "auxiliary_loss_mlp": 0.01318878, + "balance_loss_clip": 1.21933746, + "balance_loss_mlp": 1.04689026, + "epoch": 0.1096046896137081, + "flos": 33623801091360.0, + "grad_norm": 1.9751511118548488, + "language_loss": 0.79662651, + "learning_rate": 3.933894381201034e-06, + "loss": 0.82548344, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.8429417610168457 + }, + { + "auxiliary_loss_clip": 0.01565624, + "auxiliary_loss_mlp": 0.01323982, + "balance_loss_clip": 1.21830118, + "balance_loss_mlp": 1.04837012, + "epoch": 0.10966481286637607, + "flos": 26982476040960.0, + "grad_norm": 1.9917916207528505, + "language_loss": 0.79726225, + "learning_rate": 3.933795040870645e-06, + "loss": 0.82615834, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.8229787349700928 + }, + { + "auxiliary_loss_clip": 0.01569837, + "auxiliary_loss_mlp": 0.01335176, + "balance_loss_clip": 1.22142076, + "balance_loss_mlp": 1.06604922, + "epoch": 0.10972493611904403, + "flos": 23038628689440.0, + "grad_norm": 3.3861243570789834, + "language_loss": 0.8778435, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90689367, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.761894702911377 + }, + { + "auxiliary_loss_clip": 0.01576369, + "auxiliary_loss_mlp": 0.01346451, + "balance_loss_clip": 1.2294271, + "balance_loss_mlp": 1.07923102, + "epoch": 0.10978505937171201, + "flos": 38107523337120.0, + "grad_norm": 2.72748642333209, + "language_loss": 0.76489604, + "learning_rate": 3.933596140224532e-06, + "loss": 0.79412425, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.860257148742676 + }, + { + "auxiliary_loss_clip": 0.01633455, + "auxiliary_loss_mlp": 0.01243637, + "balance_loss_clip": 1.28740525, + "balance_loss_mlp": 1.04222107, + "epoch": 0.10984518262437998, + "flos": 59855822907840.0, + "grad_norm": 0.8596978357425082, + "language_loss": 0.5493964, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57816732, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.338474988937378 + }, + { + "auxiliary_loss_clip": 0.01632088, + "auxiliary_loss_mlp": 0.01241669, + "balance_loss_clip": 1.28621483, + "balance_loss_mlp": 1.03720093, + "epoch": 0.10990530587704794, + "flos": 66726956865600.0, + "grad_norm": 0.7477767296601049, + "language_loss": 0.55248332, + "learning_rate": 3.933396946289784e-06, + "loss": 0.58122087, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.265502452850342 + }, + { + "auxiliary_loss_clip": 0.01569919, + "auxiliary_loss_mlp": 0.01350542, + "balance_loss_clip": 1.22330427, + "balance_loss_mlp": 1.06939876, + "epoch": 0.10996542912971592, + "flos": 25449556224960.0, + "grad_norm": 3.097516281336185, + "language_loss": 0.84727705, + "learning_rate": 3.933297239348612e-06, + "loss": 0.87648159, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.814112424850464 + }, + { + "auxiliary_loss_clip": 0.01571323, + "auxiliary_loss_mlp": 0.01333006, + "balance_loss_clip": 1.22415912, + "balance_loss_mlp": 1.05205369, + "epoch": 0.11002555238238389, + "flos": 44021568633120.0, + "grad_norm": 1.8721334749100977, + "language_loss": 0.89083916, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91988242, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 2.9816296100616455 + }, + { + "auxiliary_loss_clip": 0.0162825, + "auxiliary_loss_mlp": 0.01233887, + "balance_loss_clip": 1.28369069, + "balance_loss_mlp": 1.01950073, + "epoch": 0.11008567563505185, + "flos": 54071872293600.0, + "grad_norm": 0.7003792367289305, + "language_loss": 0.55426079, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.58288217, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.268198013305664 + }, + { + "auxiliary_loss_clip": 0.01568041, + "auxiliary_loss_mlp": 0.01342405, + "balance_loss_clip": 1.22034168, + "balance_loss_mlp": 1.06507659, + "epoch": 0.11014579888771983, + "flos": 24245438906880.0, + "grad_norm": 2.692174981666631, + "language_loss": 0.90879172, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93789613, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 2.8311758041381836 + }, + { + "auxiliary_loss_clip": 0.01617845, + "auxiliary_loss_mlp": 0.01247246, + "balance_loss_clip": 1.27368855, + "balance_loss_mlp": 1.03514862, + "epoch": 0.1102059221403878, + "flos": 57750561328320.0, + "grad_norm": 0.75575529538439, + "language_loss": 0.59874874, + "learning_rate": 3.932897678513523e-06, + "loss": 0.62739962, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.2435965538024902 + }, + { + "auxiliary_loss_clip": 0.01569245, + "auxiliary_loss_mlp": 0.01343298, + "balance_loss_clip": 1.22211134, + "balance_loss_mlp": 1.07169127, + "epoch": 0.11026604539305576, + "flos": 16797563519040.0, + "grad_norm": 2.5491492227188703, + "language_loss": 0.81062591, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83975136, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 2.7704250812530518 + }, + { + "auxiliary_loss_clip": 0.01573044, + "auxiliary_loss_mlp": 0.01351157, + "balance_loss_clip": 1.22623205, + "balance_loss_mlp": 1.07840586, + "epoch": 0.11032616864572373, + "flos": 23990369480640.0, + "grad_norm": 3.312875798892225, + "language_loss": 0.90995836, + "learning_rate": 3.932697458306779e-06, + "loss": 0.93920034, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 2.8181540966033936 + }, + { + "auxiliary_loss_clip": 0.01567133, + "auxiliary_loss_mlp": 0.01330708, + "balance_loss_clip": 1.22094893, + "balance_loss_mlp": 1.05948257, + "epoch": 0.1103862918983917, + "flos": 19685214833760.0, + "grad_norm": 2.174392825019243, + "language_loss": 0.63959455, + "learning_rate": 3.932597238269386e-06, + "loss": 0.66857296, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 2.798114061355591 + }, + { + "auxiliary_loss_clip": 0.01563107, + "auxiliary_loss_mlp": 0.01326696, + "balance_loss_clip": 1.21698511, + "balance_loss_mlp": 1.05699706, + "epoch": 0.11044641515105967, + "flos": 32163628214880.0, + "grad_norm": 2.503482335681509, + "language_loss": 0.73106313, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75996113, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 2.920485496520996 + }, + { + "auxiliary_loss_clip": 0.01566136, + "auxiliary_loss_mlp": 0.01328466, + "balance_loss_clip": 1.2193346, + "balance_loss_mlp": 1.06048298, + "epoch": 0.11050653840372764, + "flos": 16691022224640.0, + "grad_norm": 3.1700447011646076, + "language_loss": 0.78313863, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81208462, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 2.7528419494628906 + }, + { + "auxiliary_loss_clip": 0.01568616, + "auxiliary_loss_mlp": 0.01336138, + "balance_loss_clip": 1.22278535, + "balance_loss_mlp": 1.06701136, + "epoch": 0.11056666165639562, + "flos": 21210662802240.0, + "grad_norm": 7.26753223482191, + "language_loss": 0.71472591, + "learning_rate": 3.932296138466736e-06, + "loss": 0.74377346, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 4.2972283363342285 + }, + { + "auxiliary_loss_clip": 0.01578331, + "auxiliary_loss_mlp": 0.01359484, + "balance_loss_clip": 1.23308372, + "balance_loss_mlp": 1.08959436, + "epoch": 0.11062678490906358, + "flos": 19167148995840.0, + "grad_norm": 2.7333895967586153, + "language_loss": 0.7897808, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81915891, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 2.737677812576294 + }, + { + "auxiliary_loss_clip": 0.01575186, + "auxiliary_loss_mlp": 0.01337293, + "balance_loss_clip": 1.22938752, + "balance_loss_mlp": 1.07274425, + "epoch": 0.11068690816173155, + "flos": 24897354678720.0, + "grad_norm": 1.7785653300587978, + "language_loss": 0.87806803, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90719283, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 2.789870500564575 + }, + { + "auxiliary_loss_clip": 0.0157469, + "auxiliary_loss_mlp": 0.0136069, + "balance_loss_clip": 1.22838616, + "balance_loss_mlp": 1.09118176, + "epoch": 0.11074703141439952, + "flos": 16474260604320.0, + "grad_norm": 1.962092734862155, + "language_loss": 0.90510881, + "learning_rate": 3.931994379208334e-06, + "loss": 0.93446261, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 2.747849941253662 + }, + { + "auxiliary_loss_clip": 0.0156974, + "auxiliary_loss_mlp": 0.01327683, + "balance_loss_clip": 1.2237041, + "balance_loss_mlp": 1.06141722, + "epoch": 0.11080715466706749, + "flos": 19174469130720.0, + "grad_norm": 4.796818013835874, + "language_loss": 0.85770774, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88668191, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 2.7127366065979004 + }, + { + "auxiliary_loss_clip": 0.01574443, + "auxiliary_loss_mlp": 0.01332943, + "balance_loss_clip": 1.22780132, + "balance_loss_mlp": 1.06820333, + "epoch": 0.11086727791973545, + "flos": 27706911183360.0, + "grad_norm": 1.8091033659800384, + "language_loss": 0.74730974, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77638358, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.857529401779175 + }, + { + "auxiliary_loss_clip": 0.0158519, + "auxiliary_loss_mlp": 0.01348403, + "balance_loss_clip": 1.24006426, + "balance_loss_mlp": 1.0823276, + "epoch": 0.11092740117240343, + "flos": 18516939991200.0, + "grad_norm": 2.4713722576905264, + "language_loss": 0.75669205, + "learning_rate": 3.931691960597165e-06, + "loss": 0.78602803, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 4.374290227890015 + }, + { + "auxiliary_loss_clip": 0.015814, + "auxiliary_loss_mlp": 0.01324268, + "balance_loss_clip": 1.23635888, + "balance_loss_mlp": 1.06067276, + "epoch": 0.1109875244250714, + "flos": 20524573393920.0, + "grad_norm": 1.6786786744669746, + "language_loss": 0.76487648, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.79393315, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.7660348415374756 + }, + { + "auxiliary_loss_clip": 0.01590099, + "auxiliary_loss_mlp": 0.01354794, + "balance_loss_clip": 1.24503994, + "balance_loss_mlp": 1.08604848, + "epoch": 0.11104764767773936, + "flos": 14100237532800.0, + "grad_norm": 2.659118426136626, + "language_loss": 0.85862166, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88807058, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 4.2319276332855225 + }, + { + "auxiliary_loss_clip": 0.0158172, + "auxiliary_loss_mlp": 0.01347747, + "balance_loss_clip": 1.23649478, + "balance_loss_mlp": 1.08548617, + "epoch": 0.11110777093040733, + "flos": 20596599698400.0, + "grad_norm": 2.8187242475764647, + "language_loss": 0.7737143, + "learning_rate": 3.931388882736438e-06, + "loss": 0.80300903, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.7630953788757324 + }, + { + "auxiliary_loss_clip": 0.0158537, + "auxiliary_loss_mlp": 0.01314162, + "balance_loss_clip": 1.24032915, + "balance_loss_mlp": 1.04350901, + "epoch": 0.11116789418307531, + "flos": 21872136470400.0, + "grad_norm": 2.713190774462197, + "language_loss": 0.77719367, + "learning_rate": 3.931287710300832e-06, + "loss": 0.80618894, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 4.209035634994507 + }, + { + "auxiliary_loss_clip": 0.01582732, + "auxiliary_loss_mlp": 0.01320045, + "balance_loss_clip": 1.23750734, + "balance_loss_mlp": 1.04748452, + "epoch": 0.11122801743574327, + "flos": 15524909287200.0, + "grad_norm": 10.32901869097278, + "language_loss": 0.71853685, + "learning_rate": 3.931186464630601e-06, + "loss": 0.74756461, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.6907241344451904 + }, + { + "auxiliary_loss_clip": 0.01585067, + "auxiliary_loss_mlp": 0.01322808, + "balance_loss_clip": 1.23958755, + "balance_loss_mlp": 1.05081975, + "epoch": 0.11128814068841124, + "flos": 14394031974720.0, + "grad_norm": 3.898546586627535, + "language_loss": 0.81782162, + "learning_rate": 3.931085145729588e-06, + "loss": 0.84690034, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 2.8891055583953857 + }, + { + "auxiliary_loss_clip": 0.01589084, + "auxiliary_loss_mlp": 0.01318309, + "balance_loss_clip": 1.24414182, + "balance_loss_mlp": 1.04422331, + "epoch": 0.11134826394107922, + "flos": 16655786599680.0, + "grad_norm": 3.1176537240680573, + "language_loss": 0.88507301, + "learning_rate": 3.930983753601631e-06, + "loss": 0.91414696, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.8072891235351562 + }, + { + "auxiliary_loss_clip": 0.01589324, + "auxiliary_loss_mlp": 0.01334967, + "balance_loss_clip": 1.24392974, + "balance_loss_mlp": 1.05992699, + "epoch": 0.11140838719374718, + "flos": 16692880704480.0, + "grad_norm": 6.517485946818969, + "language_loss": 0.72148263, + "learning_rate": 3.930882288250578e-06, + "loss": 0.75072551, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.8168044090270996 + }, + { + "auxiliary_loss_clip": 0.01649299, + "auxiliary_loss_mlp": 0.01220177, + "balance_loss_clip": 1.30770683, + "balance_loss_mlp": 1.01494598, + "epoch": 0.11146851044641515, + "flos": 60982300553760.0, + "grad_norm": 0.7796264862931447, + "language_loss": 0.5360809, + "learning_rate": 3.930780749680273e-06, + "loss": 0.56477565, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.271193504333496 + }, + { + "auxiliary_loss_clip": 0.01577, + "auxiliary_loss_mlp": 0.01333327, + "balance_loss_clip": 1.23009872, + "balance_loss_mlp": 1.05275655, + "epoch": 0.11152863369908313, + "flos": 22195666954080.0, + "grad_norm": 4.208071053241383, + "language_loss": 0.85177565, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.88087893, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 2.7947657108306885 + }, + { + "auxiliary_loss_clip": 0.01580257, + "auxiliary_loss_mlp": 0.01337872, + "balance_loss_clip": 1.23564863, + "balance_loss_mlp": 1.04871798, + "epoch": 0.11158875695175109, + "flos": 19539607170240.0, + "grad_norm": 2.16314771598226, + "language_loss": 0.81679583, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84597707, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.774998903274536 + }, + { + "auxiliary_loss_clip": 0.01579214, + "auxiliary_loss_mlp": 0.01330848, + "balance_loss_clip": 1.23380399, + "balance_loss_mlp": 1.05390131, + "epoch": 0.11164888020441906, + "flos": 25444815204960.0, + "grad_norm": 2.704626863796713, + "language_loss": 0.83126152, + "learning_rate": 3.93047569469238e-06, + "loss": 0.86036217, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.8167033195495605 + }, + { + "auxiliary_loss_clip": 0.01573655, + "auxiliary_loss_mlp": 0.0132089, + "balance_loss_clip": 1.22886825, + "balance_loss_mlp": 1.0523355, + "epoch": 0.11170900345708702, + "flos": 15634560690720.0, + "grad_norm": 2.986219785774391, + "language_loss": 0.82970166, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85864711, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.7519936561584473 + }, + { + "auxiliary_loss_clip": 0.01571834, + "auxiliary_loss_mlp": 0.01325236, + "balance_loss_clip": 1.22745359, + "balance_loss_mlp": 1.05114985, + "epoch": 0.111769126709755, + "flos": 23041662942240.0, + "grad_norm": 1.9574843556601031, + "language_loss": 0.91711497, + "learning_rate": 3.930271958674866e-06, + "loss": 0.94608569, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 2.8050525188446045 + }, + { + "auxiliary_loss_clip": 0.01569711, + "auxiliary_loss_mlp": 0.01333906, + "balance_loss_clip": 1.22455657, + "balance_loss_mlp": 1.06687701, + "epoch": 0.11182924996242297, + "flos": 20852844897600.0, + "grad_norm": 5.044914797344564, + "language_loss": 0.82248831, + "learning_rate": 3.930169980870018e-06, + "loss": 0.85152447, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 2.794226884841919 + }, + { + "auxiliary_loss_clip": 0.01578758, + "auxiliary_loss_mlp": 0.01318086, + "balance_loss_clip": 1.23358238, + "balance_loss_mlp": 1.05315518, + "epoch": 0.11188937321509093, + "flos": 17457140779200.0, + "grad_norm": 2.4659089292970378, + "language_loss": 0.75075066, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77971911, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.7270801067352295 + }, + { + "auxiliary_loss_clip": 0.01574769, + "auxiliary_loss_mlp": 0.01333891, + "balance_loss_clip": 1.22943521, + "balance_loss_mlp": 1.06323814, + "epoch": 0.11194949646775891, + "flos": 24098124476160.0, + "grad_norm": 2.4596149622792285, + "language_loss": 0.89465284, + "learning_rate": 3.929965805687474e-06, + "loss": 0.92373943, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.8471598625183105 + }, + { + "auxiliary_loss_clip": 0.01570703, + "auxiliary_loss_mlp": 0.01327347, + "balance_loss_clip": 1.2246902, + "balance_loss_mlp": 1.06699371, + "epoch": 0.11200961972042688, + "flos": 25156141064640.0, + "grad_norm": 3.55758931147988, + "language_loss": 0.87022722, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89920771, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.7963924407958984 + }, + { + "auxiliary_loss_clip": 0.01570772, + "auxiliary_loss_mlp": 0.01333487, + "balance_loss_clip": 1.22581744, + "balance_loss_mlp": 1.07198942, + "epoch": 0.11206974297309484, + "flos": 21290464379520.0, + "grad_norm": 2.861108010144713, + "language_loss": 0.6420911, + "learning_rate": 3.929761337766945e-06, + "loss": 0.67113376, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.7601821422576904 + }, + { + "auxiliary_loss_clip": 0.01582627, + "auxiliary_loss_mlp": 0.01332198, + "balance_loss_clip": 1.23694777, + "balance_loss_mlp": 1.06936526, + "epoch": 0.11212986622576282, + "flos": 18917958434400.0, + "grad_norm": 3.3680438408534537, + "language_loss": 0.73654276, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76569104, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.7297277450561523 + }, + { + "auxiliary_loss_clip": 0.01584805, + "auxiliary_loss_mlp": 0.0134497, + "balance_loss_clip": 1.23890388, + "balance_loss_mlp": 1.0867151, + "epoch": 0.11218998947843078, + "flos": 22056962215680.0, + "grad_norm": 2.1164361267324927, + "language_loss": 0.8456099, + "learning_rate": 3.929556577139446e-06, + "loss": 0.87490767, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.778175115585327 + }, + { + "auxiliary_loss_clip": 0.01584408, + "auxiliary_loss_mlp": 0.01323751, + "balance_loss_clip": 1.23848569, + "balance_loss_mlp": 1.05939221, + "epoch": 0.11225011273109875, + "flos": 24574013835840.0, + "grad_norm": 1.8253077495212777, + "language_loss": 0.81511664, + "learning_rate": 3.929454087070286e-06, + "loss": 0.84419823, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.7879719734191895 + }, + { + "auxiliary_loss_clip": 0.01583653, + "auxiliary_loss_mlp": 0.01354733, + "balance_loss_clip": 1.23864555, + "balance_loss_mlp": 1.08884907, + "epoch": 0.11231023598376672, + "flos": 28441169719200.0, + "grad_norm": 2.2658909790376636, + "language_loss": 0.86880922, + "learning_rate": 3.929351523836035e-06, + "loss": 0.898193, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.7961485385894775 + }, + { + "auxiliary_loss_clip": 0.01594516, + "auxiliary_loss_mlp": 0.01335237, + "balance_loss_clip": 1.24958587, + "balance_loss_mlp": 1.07431173, + "epoch": 0.1123703592364347, + "flos": 14428091826720.0, + "grad_norm": 2.4411891544595363, + "language_loss": 0.68613684, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.71543437, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.783376693725586 + }, + { + "auxiliary_loss_clip": 0.01601801, + "auxiliary_loss_mlp": 0.01339236, + "balance_loss_clip": 1.25734627, + "balance_loss_mlp": 1.07525849, + "epoch": 0.11243048248910266, + "flos": 22238450282880.0, + "grad_norm": 3.2742907560816326, + "language_loss": 0.77535009, + "learning_rate": 3.929146177887814e-06, + "loss": 0.8047604, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.7555935382843018 + }, + { + "auxiliary_loss_clip": 0.01584904, + "auxiliary_loss_mlp": 0.01340299, + "balance_loss_clip": 1.23872733, + "balance_loss_mlp": 1.07365203, + "epoch": 0.11249060574177062, + "flos": 18585932042880.0, + "grad_norm": 2.2554774802082496, + "language_loss": 0.758219, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78747106, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 2.745347738265991 + }, + { + "auxiliary_loss_clip": 0.01602218, + "auxiliary_loss_mlp": 0.01356669, + "balance_loss_clip": 1.25759196, + "balance_loss_mlp": 1.0890677, + "epoch": 0.1125507289944386, + "flos": 22858771533120.0, + "grad_norm": 2.2755255594812867, + "language_loss": 0.81741834, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84700716, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 2.736677646636963 + }, + { + "auxiliary_loss_clip": 0.01605179, + "auxiliary_loss_mlp": 0.01352304, + "balance_loss_clip": 1.25936913, + "balance_loss_mlp": 1.08489347, + "epoch": 0.11261085224710657, + "flos": 19678084339680.0, + "grad_norm": 3.6064705053220845, + "language_loss": 0.83165848, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.86123335, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.8802764415740967 + }, + { + "auxiliary_loss_clip": 0.01596208, + "auxiliary_loss_mlp": 0.01330459, + "balance_loss_clip": 1.25087595, + "balance_loss_mlp": 1.06323957, + "epoch": 0.11267097549977453, + "flos": 26065401952320.0, + "grad_norm": 2.2749441633572576, + "language_loss": 0.92294961, + "learning_rate": 3.928734608181575e-06, + "loss": 0.95221627, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 2.850616693496704 + }, + { + "auxiliary_loss_clip": 0.01611058, + "auxiliary_loss_mlp": 0.01332757, + "balance_loss_clip": 1.26551163, + "balance_loss_mlp": 1.06782627, + "epoch": 0.11273109875244251, + "flos": 21070099584000.0, + "grad_norm": 1.6925036225526333, + "language_loss": 0.75239062, + "learning_rate": 3.928631532900729e-06, + "loss": 0.78182876, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 2.7321982383728027 + }, + { + "auxiliary_loss_clip": 0.01601094, + "auxiliary_loss_mlp": 0.01317038, + "balance_loss_clip": 1.255463, + "balance_loss_mlp": 1.05058122, + "epoch": 0.11279122200511048, + "flos": 27091444737600.0, + "grad_norm": 2.820129240592321, + "language_loss": 0.72053063, + "learning_rate": 3.928528384485984e-06, + "loss": 0.74971193, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 2.777280330657959 + }, + { + "auxiliary_loss_clip": 0.0160183, + "auxiliary_loss_mlp": 0.01328782, + "balance_loss_clip": 1.25591683, + "balance_loss_mlp": 1.05717564, + "epoch": 0.11285134525777844, + "flos": 20189664462240.0, + "grad_norm": 2.092260015860789, + "language_loss": 0.76853216, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79783821, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 4.207789182662964 + }, + { + "auxiliary_loss_clip": 0.01607209, + "auxiliary_loss_mlp": 0.01358749, + "balance_loss_clip": 1.26111865, + "balance_loss_mlp": 1.0905757, + "epoch": 0.11291146851044641, + "flos": 12460093715520.0, + "grad_norm": 2.7964074761979125, + "language_loss": 0.88242793, + "learning_rate": 3.928321868270436e-06, + "loss": 0.9120875, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 2.6831912994384766 + }, + { + "auxiliary_loss_clip": 0.01595829, + "auxiliary_loss_mlp": 0.01327168, + "balance_loss_clip": 1.25091124, + "balance_loss_mlp": 1.05994892, + "epoch": 0.11297159176311439, + "flos": 23844610104480.0, + "grad_norm": 2.5964203005460744, + "language_loss": 0.80956304, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83879304, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 2.79184889793396 + }, + { + "auxiliary_loss_clip": 0.01599231, + "auxiliary_loss_mlp": 0.01335765, + "balance_loss_clip": 1.25306797, + "balance_loss_mlp": 1.06701982, + "epoch": 0.11303171501578235, + "flos": 29932899189120.0, + "grad_norm": 5.741219027431462, + "language_loss": 0.7042501, + "learning_rate": 3.928115059566259e-06, + "loss": 0.73360002, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 2.8325185775756836 + }, + { + "auxiliary_loss_clip": 0.01599022, + "auxiliary_loss_mlp": 0.01333405, + "balance_loss_clip": 1.25277436, + "balance_loss_mlp": 1.06599426, + "epoch": 0.11309183826845032, + "flos": 16182438426720.0, + "grad_norm": 1.9634721399431303, + "language_loss": 0.72543824, + "learning_rate": 3.928011545540734e-06, + "loss": 0.75476241, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.783524513244629 + }, + { + "auxiliary_loss_clip": 0.01592423, + "auxiliary_loss_mlp": 0.01342858, + "balance_loss_clip": 1.24554849, + "balance_loss_mlp": 1.07296813, + "epoch": 0.1131519615211183, + "flos": 12022322520960.0, + "grad_norm": 4.112836210440839, + "language_loss": 0.74196994, + "learning_rate": 3.927907958404819e-06, + "loss": 0.77132273, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 2.744030714035034 + }, + { + "auxiliary_loss_clip": 0.01595789, + "auxiliary_loss_mlp": 0.01331985, + "balance_loss_clip": 1.24847829, + "balance_loss_mlp": 1.06610084, + "epoch": 0.11321208477378626, + "flos": 26252541315360.0, + "grad_norm": 2.5764240211487226, + "language_loss": 0.79639316, + "learning_rate": 3.92780429816244e-06, + "loss": 0.82567084, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 4.283812761306763 + }, + { + "auxiliary_loss_clip": 0.0158508, + "auxiliary_loss_mlp": 0.01323876, + "balance_loss_clip": 1.23730302, + "balance_loss_mlp": 1.05436754, + "epoch": 0.11327220802645423, + "flos": 13628103060960.0, + "grad_norm": 8.951093637105359, + "language_loss": 0.77392852, + "learning_rate": 3.927700564817529e-06, + "loss": 0.80301803, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 2.7816543579101562 + }, + { + "auxiliary_loss_clip": 0.01705945, + "auxiliary_loss_mlp": 0.01219871, + "balance_loss_clip": 1.36313152, + "balance_loss_mlp": 1.01311493, + "epoch": 0.1133323312791222, + "flos": 57198094284960.0, + "grad_norm": 0.7870630232116446, + "language_loss": 0.55169356, + "learning_rate": 3.927596758374019e-06, + "loss": 0.58095169, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 4.637825965881348 + }, + { + "auxiliary_loss_clip": 0.01590309, + "auxiliary_loss_mlp": 0.01317492, + "balance_loss_clip": 1.24472213, + "balance_loss_mlp": 1.05484998, + "epoch": 0.11339245453179017, + "flos": 24353952465600.0, + "grad_norm": 2.671570528973917, + "language_loss": 0.90381265, + "learning_rate": 3.927492878835848e-06, + "loss": 0.93289065, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 4.3306708335876465 + }, + { + "auxiliary_loss_clip": 0.01592183, + "auxiliary_loss_mlp": 0.01329473, + "balance_loss_clip": 1.2454499, + "balance_loss_mlp": 1.06511497, + "epoch": 0.11345257778445814, + "flos": 22672694158560.0, + "grad_norm": 2.2121291920234505, + "language_loss": 0.85208255, + "learning_rate": 3.927388926206953e-06, + "loss": 0.88129902, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 2.7636830806732178 + }, + { + "auxiliary_loss_clip": 0.01595517, + "auxiliary_loss_mlp": 0.01349394, + "balance_loss_clip": 1.24936855, + "balance_loss_mlp": 1.08560741, + "epoch": 0.11351270103712612, + "flos": 20989918725120.0, + "grad_norm": 5.7510596474807745, + "language_loss": 0.75963092, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78908002, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.747469186782837 + }, + { + "auxiliary_loss_clip": 0.01587111, + "auxiliary_loss_mlp": 0.01351519, + "balance_loss_clip": 1.24201298, + "balance_loss_mlp": 1.08773279, + "epoch": 0.11357282428979408, + "flos": 37351304032320.0, + "grad_norm": 2.2492810217214987, + "language_loss": 0.68471891, + "learning_rate": 3.927180801692764e-06, + "loss": 0.71410519, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 3.028731107711792 + }, + { + "auxiliary_loss_clip": 0.01596676, + "auxiliary_loss_mlp": 0.01346263, + "balance_loss_clip": 1.25004053, + "balance_loss_mlp": 1.08228612, + "epoch": 0.11363294754246205, + "flos": 21758881891680.0, + "grad_norm": 33.88241506551548, + "language_loss": 0.84003949, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86946893, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 2.769272804260254 + }, + { + "auxiliary_loss_clip": 0.01597241, + "auxiliary_loss_mlp": 0.0133684, + "balance_loss_clip": 1.25263405, + "balance_loss_mlp": 1.07686853, + "epoch": 0.11369307079513001, + "flos": 22603853819520.0, + "grad_norm": 3.9971691480486378, + "language_loss": 0.64969063, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67903149, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.7357804775238037 + }, + { + "auxiliary_loss_clip": 0.01591763, + "auxiliary_loss_mlp": 0.01313661, + "balance_loss_clip": 1.24799371, + "balance_loss_mlp": 1.05216372, + "epoch": 0.11375319404779799, + "flos": 21946400536320.0, + "grad_norm": 3.233227877471151, + "language_loss": 0.88411003, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.91316426, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.7894127368927 + }, + { + "auxiliary_loss_clip": 0.0160731, + "auxiliary_loss_mlp": 0.01338931, + "balance_loss_clip": 1.26275599, + "balance_loss_mlp": 1.0762887, + "epoch": 0.11381331730046595, + "flos": 26397845553600.0, + "grad_norm": 2.5015251601381654, + "language_loss": 0.73039079, + "learning_rate": 3.926763675749339e-06, + "loss": 0.75985318, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 2.760490894317627 + }, + { + "auxiliary_loss_clip": 0.01612406, + "auxiliary_loss_mlp": 0.01317268, + "balance_loss_clip": 1.2675786, + "balance_loss_mlp": 1.05405366, + "epoch": 0.11387344055313392, + "flos": 23806947077280.0, + "grad_norm": 2.0945512143519025, + "language_loss": 0.7995137, + "learning_rate": 3.92665921159591e-06, + "loss": 0.82881045, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.8029024600982666 + }, + { + "auxiliary_loss_clip": 0.01606705, + "auxiliary_loss_mlp": 0.01329636, + "balance_loss_clip": 1.26227355, + "balance_loss_mlp": 1.06413269, + "epoch": 0.1139335638058019, + "flos": 34525210849920.0, + "grad_norm": 3.1985769885299327, + "language_loss": 0.80103391, + "learning_rate": 3.926554674383371e-06, + "loss": 0.83039737, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.9075722694396973 + }, + { + "auxiliary_loss_clip": 0.01765233, + "auxiliary_loss_mlp": 0.01212799, + "balance_loss_clip": 1.42364097, + "balance_loss_mlp": 1.0083313, + "epoch": 0.11399368705846986, + "flos": 70595174737440.0, + "grad_norm": 0.8003571690330036, + "language_loss": 0.6325959, + "learning_rate": 3.926450064115686e-06, + "loss": 0.66237617, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.401296377182007 + }, + { + "auxiliary_loss_clip": 0.01601741, + "auxiliary_loss_mlp": 0.01329248, + "balance_loss_clip": 1.25820768, + "balance_loss_mlp": 1.06546152, + "epoch": 0.11405381031113783, + "flos": 21326382711360.0, + "grad_norm": 2.14446369581225, + "language_loss": 0.84916401, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87847388, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.80757474899292 + }, + { + "auxiliary_loss_clip": 0.01605245, + "auxiliary_loss_mlp": 0.01337746, + "balance_loss_clip": 1.26050889, + "balance_loss_mlp": 1.07033515, + "epoch": 0.11411393356380581, + "flos": 19721702088000.0, + "grad_norm": 3.1290785176284497, + "language_loss": 0.80255544, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.83198535, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.757133722305298 + }, + { + "auxiliary_loss_clip": 0.01594218, + "auxiliary_loss_mlp": 0.01326206, + "balance_loss_clip": 1.24930263, + "balance_loss_mlp": 1.05860484, + "epoch": 0.11417405681647377, + "flos": 17532390977280.0, + "grad_norm": 2.5005040110801744, + "language_loss": 0.73742342, + "learning_rate": 3.926135795021435e-06, + "loss": 0.76662767, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.8061983585357666 + }, + { + "auxiliary_loss_clip": 0.01746324, + "auxiliary_loss_mlp": 0.0125354, + "balance_loss_clip": 1.40304363, + "balance_loss_mlp": 1.05441284, + "epoch": 0.11423418006914174, + "flos": 59681199837600.0, + "grad_norm": 0.9182438859326774, + "language_loss": 0.63381171, + "learning_rate": 3.92603089257286e-06, + "loss": 0.66381037, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 3.1929028034210205 + }, + { + "auxiliary_loss_clip": 0.01589097, + "auxiliary_loss_mlp": 0.01332291, + "balance_loss_clip": 1.24343741, + "balance_loss_mlp": 1.06526184, + "epoch": 0.1142943033218097, + "flos": 22965161114880.0, + "grad_norm": 1.7312313649321178, + "language_loss": 0.78372395, + "learning_rate": 3.925925917089001e-06, + "loss": 0.8129378, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.780412197113037 + }, + { + "auxiliary_loss_clip": 0.01592235, + "auxiliary_loss_mlp": 0.01325275, + "balance_loss_clip": 1.24803793, + "balance_loss_mlp": 1.05919993, + "epoch": 0.11435442657447768, + "flos": 18258039820800.0, + "grad_norm": 2.2480262410744545, + "language_loss": 0.83983648, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86901152, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.7362403869628906 + }, + { + "auxiliary_loss_clip": 0.01591462, + "auxiliary_loss_mlp": 0.01334972, + "balance_loss_clip": 1.24497056, + "balance_loss_mlp": 1.06775284, + "epoch": 0.11441454982714565, + "flos": 24063495701760.0, + "grad_norm": 2.8393122907186514, + "language_loss": 0.78222716, + "learning_rate": 3.925715747031356e-06, + "loss": 0.81149155, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.820417881011963 + }, + { + "auxiliary_loss_clip": 0.01582323, + "auxiliary_loss_mlp": 0.01323865, + "balance_loss_clip": 1.23533344, + "balance_loss_mlp": 1.05588233, + "epoch": 0.11447467307981361, + "flos": 25340056534080.0, + "grad_norm": 2.100885408302502, + "language_loss": 0.75564146, + "learning_rate": 3.925610552465539e-06, + "loss": 0.78470337, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.8240251541137695 + }, + { + "auxiliary_loss_clip": 0.01591258, + "auxiliary_loss_mlp": 0.01350825, + "balance_loss_clip": 1.24313724, + "balance_loss_mlp": 1.08227038, + "epoch": 0.11453479633248159, + "flos": 21728235574080.0, + "grad_norm": 4.1462104354490865, + "language_loss": 0.92004776, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94946855, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.840899705886841 + }, + { + "auxiliary_loss_clip": 0.01580875, + "auxiliary_loss_mlp": 0.01354084, + "balance_loss_clip": 1.23309135, + "balance_loss_mlp": 1.07751811, + "epoch": 0.11459491958514956, + "flos": 12971408340960.0, + "grad_norm": 3.702407066650207, + "language_loss": 0.77762014, + "learning_rate": 3.925399944279861e-06, + "loss": 0.80696976, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.7789313793182373 + }, + { + "auxiliary_loss_clip": 0.0159173, + "auxiliary_loss_mlp": 0.01359879, + "balance_loss_clip": 1.24252772, + "balance_loss_mlp": 1.09094286, + "epoch": 0.11465504283781752, + "flos": 22713505223040.0, + "grad_norm": 2.6939710240353607, + "language_loss": 0.82055771, + "learning_rate": 3.925294530667986e-06, + "loss": 0.85007375, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.74688982963562 + }, + { + "auxiliary_loss_clip": 0.01606189, + "auxiliary_loss_mlp": 0.01356282, + "balance_loss_clip": 1.25766945, + "balance_loss_mlp": 1.09135175, + "epoch": 0.1147151660904855, + "flos": 23400315266400.0, + "grad_norm": 2.358952560637877, + "language_loss": 0.84653866, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87616342, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 2.7717223167419434 + }, + { + "auxiliary_loss_clip": 0.01707188, + "auxiliary_loss_mlp": 0.01224991, + "balance_loss_clip": 1.35959148, + "balance_loss_mlp": 1.01823425, + "epoch": 0.11477528934315347, + "flos": 63017470164960.0, + "grad_norm": 0.947504328892833, + "language_loss": 0.61038333, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63970506, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 3.0896217823028564 + }, + { + "auxiliary_loss_clip": 0.01589202, + "auxiliary_loss_mlp": 0.01346619, + "balance_loss_clip": 1.24067986, + "balance_loss_mlp": 1.08416843, + "epoch": 0.11483541259582143, + "flos": 16327439239680.0, + "grad_norm": 1.9960578329970193, + "language_loss": 0.78666663, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81602478, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 2.8298614025115967 + }, + { + "auxiliary_loss_clip": 0.01589414, + "auxiliary_loss_mlp": 0.01342326, + "balance_loss_clip": 1.24068153, + "balance_loss_mlp": 1.07949376, + "epoch": 0.1148955358484894, + "flos": 21582589982400.0, + "grad_norm": 2.2832199275902934, + "language_loss": 0.76835918, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79767662, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.8051304817199707 + }, + { + "auxiliary_loss_clip": 0.01581685, + "auxiliary_loss_mlp": 0.01375698, + "balance_loss_clip": 1.23194826, + "balance_loss_mlp": 1.12373781, + "epoch": 0.11495565910115738, + "flos": 27676226937600.0, + "grad_norm": 1.7953095525116192, + "language_loss": 0.79315305, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.82272685, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 2.894055128097534 + }, + { + "auxiliary_loss_clip": 0.0158326, + "auxiliary_loss_mlp": 0.01391975, + "balance_loss_clip": 1.23363018, + "balance_loss_mlp": 1.14173126, + "epoch": 0.11501578235382534, + "flos": 20634528222720.0, + "grad_norm": 2.084176634373519, + "language_loss": 0.78069568, + "learning_rate": 3.924660515982246e-06, + "loss": 0.81044805, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.8385889530181885 + }, + { + "auxiliary_loss_clip": 0.01580415, + "auxiliary_loss_mlp": 0.01382395, + "balance_loss_clip": 1.23003101, + "balance_loss_mlp": 1.13157845, + "epoch": 0.1150759056064933, + "flos": 19831125922560.0, + "grad_norm": 2.1427056325469844, + "language_loss": 0.70073491, + "learning_rate": 3.924554591402939e-06, + "loss": 0.73036301, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 2.748417377471924 + }, + { + "auxiliary_loss_clip": 0.01704378, + "auxiliary_loss_mlp": 0.012696, + "balance_loss_clip": 1.3525573, + "balance_loss_mlp": 1.06970978, + "epoch": 0.11513602885916129, + "flos": 70053403435200.0, + "grad_norm": 0.7653789359647972, + "language_loss": 0.60962439, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63936412, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 4.937025785446167 + }, + { + "auxiliary_loss_clip": 0.01590809, + "auxiliary_loss_mlp": 0.01376976, + "balance_loss_clip": 1.24076796, + "balance_loss_mlp": 1.12139082, + "epoch": 0.11519615211182925, + "flos": 15743529387360.0, + "grad_norm": 3.2996584618345657, + "language_loss": 0.9296459, + "learning_rate": 3.924342523310436e-06, + "loss": 0.9593237, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 2.745763063430786 + }, + { + "auxiliary_loss_clip": 0.015797, + "auxiliary_loss_mlp": 0.01348708, + "balance_loss_clip": 1.22909927, + "balance_loss_mlp": 1.09846425, + "epoch": 0.11525627536449722, + "flos": 20669725919520.0, + "grad_norm": 2.00928549248525, + "language_loss": 0.72940719, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.75869131, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 2.7850501537323 + }, + { + "auxiliary_loss_clip": 0.01573332, + "auxiliary_loss_mlp": 0.0134917, + "balance_loss_clip": 1.22228646, + "balance_loss_mlp": 1.09968829, + "epoch": 0.1153163986171652, + "flos": 20305384371360.0, + "grad_norm": 5.0838841471521325, + "language_loss": 0.73990178, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76912677, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 2.7762839794158936 + }, + { + "auxiliary_loss_clip": 0.0157336, + "auxiliary_loss_mlp": 0.01344802, + "balance_loss_clip": 1.22226834, + "balance_loss_mlp": 1.08788252, + "epoch": 0.11537652186983316, + "flos": 17641245889440.0, + "grad_norm": 2.6589257625017226, + "language_loss": 0.86586308, + "learning_rate": 3.92402387389729e-06, + "loss": 0.89504474, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 2.7819266319274902 + }, + { + "auxiliary_loss_clip": 0.01576056, + "auxiliary_loss_mlp": 0.01347735, + "balance_loss_clip": 1.22524714, + "balance_loss_mlp": 1.09081531, + "epoch": 0.11543664512250112, + "flos": 21071540854080.0, + "grad_norm": 2.148621671918416, + "language_loss": 0.86812615, + "learning_rate": 3.923917511502512e-06, + "loss": 0.89736408, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 2.7705442905426025 + }, + { + "auxiliary_loss_clip": 0.01582042, + "auxiliary_loss_mlp": 0.0133513, + "balance_loss_clip": 1.23027039, + "balance_loss_mlp": 1.07286978, + "epoch": 0.11549676837516909, + "flos": 22749764908320.0, + "grad_norm": 2.162940897167377, + "language_loss": 0.79555786, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82472956, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.833897829055786 + }, + { + "auxiliary_loss_clip": 0.01582895, + "auxiliary_loss_mlp": 0.01338792, + "balance_loss_clip": 1.23294759, + "balance_loss_mlp": 1.07252645, + "epoch": 0.11555689162783707, + "flos": 19170828027360.0, + "grad_norm": 3.1535593961807997, + "language_loss": 0.77898228, + "learning_rate": 3.923704567851557e-06, + "loss": 0.80819917, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 4.3811259269714355 + }, + { + "auxiliary_loss_clip": 0.01581083, + "auxiliary_loss_mlp": 0.01340731, + "balance_loss_clip": 1.2311883, + "balance_loss_mlp": 1.07675362, + "epoch": 0.11561701488050503, + "flos": 24574393117440.0, + "grad_norm": 1.996379279642874, + "language_loss": 0.84153771, + "learning_rate": 3.923597986603456e-06, + "loss": 0.87075579, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 2.8232529163360596 + }, + { + "auxiliary_loss_clip": 0.01583545, + "auxiliary_loss_mlp": 0.0135312, + "balance_loss_clip": 1.23235095, + "balance_loss_mlp": 1.08284843, + "epoch": 0.115677138133173, + "flos": 17094468070080.0, + "grad_norm": 4.349634084493278, + "language_loss": 0.8088873, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83825397, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 5.712247371673584 + }, + { + "auxiliary_loss_clip": 0.01692406, + "auxiliary_loss_mlp": 0.01232475, + "balance_loss_clip": 1.34257698, + "balance_loss_mlp": 1.02190399, + "epoch": 0.11573726138584098, + "flos": 62710628071680.0, + "grad_norm": 0.832264179908398, + "language_loss": 0.61194283, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6411916, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 3.3952908515930176 + }, + { + "auxiliary_loss_clip": 0.01575146, + "auxiliary_loss_mlp": 0.01331162, + "balance_loss_clip": 1.22466731, + "balance_loss_mlp": 1.0725255, + "epoch": 0.11579738463850894, + "flos": 22603322825280.0, + "grad_norm": 2.0459846603906873, + "language_loss": 0.7550211, + "learning_rate": 3.923277805217161e-06, + "loss": 0.78408414, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.788083791732788 + }, + { + "auxiliary_loss_clip": 0.01578948, + "auxiliary_loss_mlp": 0.01349596, + "balance_loss_clip": 1.22788656, + "balance_loss_mlp": 1.08828974, + "epoch": 0.11585750789117691, + "flos": 21728311430400.0, + "grad_norm": 2.9191289406528993, + "language_loss": 0.72545731, + "learning_rate": 3.923170932221222e-06, + "loss": 0.75474268, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.83231520652771 + }, + { + "auxiliary_loss_clip": 0.01576705, + "auxiliary_loss_mlp": 0.013852, + "balance_loss_clip": 1.225191, + "balance_loss_mlp": 1.13152254, + "epoch": 0.11591763114384489, + "flos": 26290014701760.0, + "grad_norm": 1.6968476585479606, + "language_loss": 0.86999279, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89961183, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 2.888470411300659 + }, + { + "auxiliary_loss_clip": 0.01578308, + "auxiliary_loss_mlp": 0.01364117, + "balance_loss_clip": 1.22814131, + "balance_loss_mlp": 1.10490787, + "epoch": 0.11597775439651285, + "flos": 23002558644960.0, + "grad_norm": 3.4827607010697523, + "language_loss": 0.77331102, + "learning_rate": 3.922956967452898e-06, + "loss": 0.80273533, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 2.7867395877838135 + }, + { + "auxiliary_loss_clip": 0.01569972, + "auxiliary_loss_mlp": 0.01363313, + "balance_loss_clip": 1.21784401, + "balance_loss_mlp": 1.10944462, + "epoch": 0.11603787764918082, + "flos": 31944439192320.0, + "grad_norm": 2.586849205698356, + "language_loss": 0.77034837, + "learning_rate": 3.922849875688626e-06, + "loss": 0.79968125, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 2.907259464263916 + }, + { + "auxiliary_loss_clip": 0.01562561, + "auxiliary_loss_mlp": 0.01377791, + "balance_loss_clip": 1.21080232, + "balance_loss_mlp": 1.11877263, + "epoch": 0.1160980009018488, + "flos": 22273761764160.0, + "grad_norm": 1.82187374295418, + "language_loss": 0.72235382, + "learning_rate": 3.922742711009693e-06, + "loss": 0.75175732, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.7609474658966064 + }, + { + "auxiliary_loss_clip": 0.0155897, + "auxiliary_loss_mlp": 0.0135956, + "balance_loss_clip": 1.20795965, + "balance_loss_mlp": 1.09996974, + "epoch": 0.11615812415451676, + "flos": 22785834952800.0, + "grad_norm": 1.6388097112456188, + "language_loss": 0.82457995, + "learning_rate": 3.922635473420164e-06, + "loss": 0.85376525, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 2.7706751823425293 + }, + { + "auxiliary_loss_clip": 0.01629447, + "auxiliary_loss_mlp": 0.01254272, + "balance_loss_clip": 1.2768209, + "balance_loss_mlp": 1.0559082, + "epoch": 0.11621824740718473, + "flos": 67152591048960.0, + "grad_norm": 0.8163010682013637, + "language_loss": 0.61009836, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63893557, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.2148921489715576 + }, + { + "auxiliary_loss_clip": 0.01560671, + "auxiliary_loss_mlp": 0.01318244, + "balance_loss_clip": 1.21019197, + "balance_loss_mlp": 1.05312252, + "epoch": 0.11627837065985269, + "flos": 20378017526400.0, + "grad_norm": 3.1951958017397866, + "language_loss": 0.86322021, + "learning_rate": 3.922420779525586e-06, + "loss": 0.89200932, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.7880117893218994 + }, + { + "auxiliary_loss_clip": 0.01569295, + "auxiliary_loss_mlp": 0.01326083, + "balance_loss_clip": 1.2170186, + "balance_loss_mlp": 1.06019819, + "epoch": 0.11633849391252067, + "flos": 21727970076960.0, + "grad_norm": 2.637169803606759, + "language_loss": 0.65946704, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.68842083, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.8358025550842285 + }, + { + "auxiliary_loss_clip": 0.01566299, + "auxiliary_loss_mlp": 0.01334748, + "balance_loss_clip": 1.21539664, + "balance_loss_mlp": 1.06714678, + "epoch": 0.11639861716518864, + "flos": 18807245042400.0, + "grad_norm": 2.0380309642004817, + "language_loss": 0.75726867, + "learning_rate": 3.922205794037456e-06, + "loss": 0.7862792, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.810861587524414 + }, + { + "auxiliary_loss_clip": 0.01564654, + "auxiliary_loss_mlp": 0.01346068, + "balance_loss_clip": 1.21347213, + "balance_loss_mlp": 1.07942057, + "epoch": 0.1164587404178566, + "flos": 21217110589440.0, + "grad_norm": 2.895862285601526, + "language_loss": 0.84459835, + "learning_rate": 3.922098191955998e-06, + "loss": 0.87370563, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.8370563983917236 + }, + { + "auxiliary_loss_clip": 0.0156402, + "auxiliary_loss_mlp": 0.01364671, + "balance_loss_clip": 1.21294487, + "balance_loss_mlp": 1.0985955, + "epoch": 0.11651886367052458, + "flos": 27821113966080.0, + "grad_norm": 2.713387992184006, + "language_loss": 0.76292783, + "learning_rate": 3.921990516988384e-06, + "loss": 0.79221475, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.846848964691162 + }, + { + "auxiliary_loss_clip": 0.01573301, + "auxiliary_loss_mlp": 0.01363195, + "balance_loss_clip": 1.22170055, + "balance_loss_mlp": 1.0946399, + "epoch": 0.11657898692319255, + "flos": 22891579755840.0, + "grad_norm": 1.9624394066834825, + "language_loss": 0.79788011, + "learning_rate": 3.921882769138696e-06, + "loss": 0.82724512, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 2.804145336151123 + }, + { + "auxiliary_loss_clip": 0.01572651, + "auxiliary_loss_mlp": 0.01351558, + "balance_loss_clip": 1.22159779, + "balance_loss_mlp": 1.08586478, + "epoch": 0.11663911017586051, + "flos": 24318185846400.0, + "grad_norm": 2.73585386477744, + "language_loss": 0.86518574, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.89442778, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.75903582572937 + }, + { + "auxiliary_loss_clip": 0.01580138, + "auxiliary_loss_mlp": 0.01354059, + "balance_loss_clip": 1.22797632, + "balance_loss_mlp": 1.08836532, + "epoch": 0.11669923342852849, + "flos": 42343458363360.0, + "grad_norm": 1.5433191838750733, + "language_loss": 0.76098645, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7903285, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 3.0545194149017334 + }, + { + "auxiliary_loss_clip": 0.01570829, + "auxiliary_loss_mlp": 0.01345656, + "balance_loss_clip": 1.22128963, + "balance_loss_mlp": 1.07633829, + "epoch": 0.11675935668119646, + "flos": 14643753530400.0, + "grad_norm": 2.3154860108205413, + "language_loss": 0.88618314, + "learning_rate": 3.921559088338068e-06, + "loss": 0.91534793, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.770075559616089 + }, + { + "auxiliary_loss_clip": 0.01574802, + "auxiliary_loss_mlp": 0.01326904, + "balance_loss_clip": 1.22368228, + "balance_loss_mlp": 1.06559777, + "epoch": 0.11681947993386442, + "flos": 35119399233600.0, + "grad_norm": 1.7206215536823029, + "language_loss": 0.68102485, + "learning_rate": 3.921451049000975e-06, + "loss": 0.71004194, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.9241392612457275 + }, + { + "auxiliary_loss_clip": 0.01579982, + "auxiliary_loss_mlp": 0.01324131, + "balance_loss_clip": 1.22867143, + "balance_loss_mlp": 1.06301486, + "epoch": 0.11687960318653239, + "flos": 38986365476160.0, + "grad_norm": 1.9337207394976812, + "language_loss": 0.69704723, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72608829, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.9673874378204346 + }, + { + "auxiliary_loss_clip": 0.01575935, + "auxiliary_loss_mlp": 0.01327563, + "balance_loss_clip": 1.22670484, + "balance_loss_mlp": 1.06472993, + "epoch": 0.11693972643920036, + "flos": 25997396032800.0, + "grad_norm": 1.644100885937932, + "language_loss": 0.82684952, + "learning_rate": 3.921234751746038e-06, + "loss": 0.85588455, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.8191094398498535 + }, + { + "auxiliary_loss_clip": 0.01568996, + "auxiliary_loss_mlp": 0.01324827, + "balance_loss_clip": 1.21894956, + "balance_loss_mlp": 1.06046844, + "epoch": 0.11699984969186833, + "flos": 27274601643840.0, + "grad_norm": 2.1328425616846327, + "language_loss": 0.76823413, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.79717243, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 2.834531784057617 + }, + { + "auxiliary_loss_clip": 0.01583359, + "auxiliary_loss_mlp": 0.01341238, + "balance_loss_clip": 1.23262382, + "balance_loss_mlp": 1.07974076, + "epoch": 0.1170599729445363, + "flos": 15269650220160.0, + "grad_norm": 1.9135807125748292, + "language_loss": 0.68488681, + "learning_rate": 3.921018163077448e-06, + "loss": 0.71413279, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 2.8010354042053223 + }, + { + "auxiliary_loss_clip": 0.01588559, + "auxiliary_loss_mlp": 0.01337431, + "balance_loss_clip": 1.23904264, + "balance_loss_mlp": 1.07993889, + "epoch": 0.11712009619720427, + "flos": 17166304733760.0, + "grad_norm": 2.092545252345004, + "language_loss": 0.85274416, + "learning_rate": 3.920909759473295e-06, + "loss": 0.88200402, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 2.7414121627807617 + }, + { + "auxiliary_loss_clip": 0.01671713, + "auxiliary_loss_mlp": 0.01226158, + "balance_loss_clip": 1.31970787, + "balance_loss_mlp": 1.02321625, + "epoch": 0.11718021944987224, + "flos": 70947720627840.0, + "grad_norm": 0.8616695115260482, + "language_loss": 0.65006626, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67904496, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.358828544616699 + }, + { + "auxiliary_loss_clip": 0.01575972, + "auxiliary_loss_mlp": 0.01322168, + "balance_loss_clip": 1.22610784, + "balance_loss_mlp": 1.0625782, + "epoch": 0.1172403427025402, + "flos": 27455900070240.0, + "grad_norm": 1.7192683234015171, + "language_loss": 0.71792573, + "learning_rate": 3.920692733745835e-06, + "loss": 0.74690711, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 2.8255512714385986 + }, + { + "auxiliary_loss_clip": 0.01581601, + "auxiliary_loss_mlp": 0.0134541, + "balance_loss_clip": 1.23165596, + "balance_loss_mlp": 1.08715487, + "epoch": 0.11730046595520818, + "flos": 15670516950720.0, + "grad_norm": 2.9232195157697496, + "language_loss": 0.76849747, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79776764, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 2.7681970596313477 + }, + { + "auxiliary_loss_clip": 0.01581991, + "auxiliary_loss_mlp": 0.01347707, + "balance_loss_clip": 1.23180485, + "balance_loss_mlp": 1.08830762, + "epoch": 0.11736058920787615, + "flos": 25632561418560.0, + "grad_norm": 3.4748989621780018, + "language_loss": 0.76501131, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.7943083, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 2.8451430797576904 + }, + { + "auxiliary_loss_clip": 0.01567512, + "auxiliary_loss_mlp": 0.01343167, + "balance_loss_clip": 1.21829414, + "balance_loss_mlp": 1.08224154, + "epoch": 0.11742071246054411, + "flos": 21436451324640.0, + "grad_norm": 3.5994348567984358, + "language_loss": 0.72653013, + "learning_rate": 3.920366648918491e-06, + "loss": 0.75563693, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 4.33372163772583 + }, + { + "auxiliary_loss_clip": 0.01580346, + "auxiliary_loss_mlp": 0.01385572, + "balance_loss_clip": 1.23040748, + "balance_loss_mlp": 1.12598145, + "epoch": 0.11748083571321208, + "flos": 15999698730240.0, + "grad_norm": 2.822033180039456, + "language_loss": 0.80309677, + "learning_rate": 3.920257808329552e-06, + "loss": 0.83275598, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 2.7980029582977295 + }, + { + "auxiliary_loss_clip": 0.01577957, + "auxiliary_loss_mlp": 0.01362815, + "balance_loss_clip": 1.22623622, + "balance_loss_mlp": 1.10379696, + "epoch": 0.11754095896588006, + "flos": 16181907432480.0, + "grad_norm": 3.625945600433933, + "language_loss": 0.85979038, + "learning_rate": 3.920148894924246e-06, + "loss": 0.88919806, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.7632222175598145 + }, + { + "auxiliary_loss_clip": 0.01571773, + "auxiliary_loss_mlp": 0.01357178, + "balance_loss_clip": 1.22177243, + "balance_loss_mlp": 1.09739709, + "epoch": 0.11760108221854802, + "flos": 13263306374880.0, + "grad_norm": 2.651967051780965, + "language_loss": 0.78347313, + "learning_rate": 3.920039908706701e-06, + "loss": 0.81276268, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.7418036460876465 + }, + { + "auxiliary_loss_clip": 0.01568362, + "auxiliary_loss_mlp": 0.01347712, + "balance_loss_clip": 1.21751547, + "balance_loss_mlp": 1.08850312, + "epoch": 0.11766120547121599, + "flos": 24500735902080.0, + "grad_norm": 1.9868376906048733, + "language_loss": 0.80576307, + "learning_rate": 3.91993084968105e-06, + "loss": 0.83492374, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 2.8530495166778564 + }, + { + "auxiliary_loss_clip": 0.015765, + "auxiliary_loss_mlp": 0.0135296, + "balance_loss_clip": 1.22570467, + "balance_loss_mlp": 1.09203494, + "epoch": 0.11772132872388397, + "flos": 17785867420800.0, + "grad_norm": 5.34380884669256, + "language_loss": 0.78218645, + "learning_rate": 3.919821717851428e-06, + "loss": 0.811481, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 2.761988639831543 + }, + { + "auxiliary_loss_clip": 0.01569314, + "auxiliary_loss_mlp": 0.01337621, + "balance_loss_clip": 1.21653938, + "balance_loss_mlp": 1.07440722, + "epoch": 0.11778145197655193, + "flos": 13218285284640.0, + "grad_norm": 6.351430412713987, + "language_loss": 0.7680254, + "learning_rate": 3.919712513221976e-06, + "loss": 0.7970947, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 2.8089020252227783 + }, + { + "auxiliary_loss_clip": 0.01571826, + "auxiliary_loss_mlp": 0.01328245, + "balance_loss_clip": 1.22055769, + "balance_loss_mlp": 1.06503105, + "epoch": 0.1178415752292199, + "flos": 20232296078400.0, + "grad_norm": 2.9161814583447154, + "language_loss": 0.7065239, + "learning_rate": 3.919603235796832e-06, + "loss": 0.73552459, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 4.354807615280151 + }, + { + "auxiliary_loss_clip": 0.01563367, + "auxiliary_loss_mlp": 0.01328593, + "balance_loss_clip": 1.21186519, + "balance_loss_mlp": 1.06289959, + "epoch": 0.11790169848188788, + "flos": 13041689950080.0, + "grad_norm": 5.964075253692442, + "language_loss": 0.81391025, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.84282982, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 4.362867593765259 + }, + { + "auxiliary_loss_clip": 0.01573423, + "auxiliary_loss_mlp": 0.01315268, + "balance_loss_clip": 1.22238612, + "balance_loss_mlp": 1.05624962, + "epoch": 0.11796182173455584, + "flos": 22267503617760.0, + "grad_norm": 1.8981740398865685, + "language_loss": 0.92448723, + "learning_rate": 3.919384462576049e-06, + "loss": 0.95337415, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 4.3954222202301025 + }, + { + "auxiliary_loss_clip": 0.01570399, + "auxiliary_loss_mlp": 0.01336661, + "balance_loss_clip": 1.21824372, + "balance_loss_mlp": 1.07058573, + "epoch": 0.1180219449872238, + "flos": 10636603351200.0, + "grad_norm": 2.1346974535482466, + "language_loss": 0.87906659, + "learning_rate": 3.919274966788707e-06, + "loss": 0.9081372, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 2.80192232131958 + }, + { + "auxiliary_loss_clip": 0.01569243, + "auxiliary_loss_mlp": 0.01333237, + "balance_loss_clip": 1.21670175, + "balance_loss_mlp": 1.073838, + "epoch": 0.11808206823989177, + "flos": 20925667693440.0, + "grad_norm": 2.2717751028906013, + "language_loss": 0.84504688, + "learning_rate": 3.919165398222265e-06, + "loss": 0.87407172, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 2.7396457195281982 + }, + { + "auxiliary_loss_clip": 0.01583567, + "auxiliary_loss_mlp": 0.01337384, + "balance_loss_clip": 1.23007393, + "balance_loss_mlp": 1.07550454, + "epoch": 0.11814219149255975, + "flos": 20779908317280.0, + "grad_norm": 2.473132507988784, + "language_loss": 0.8313821, + "learning_rate": 3.919055756880879e-06, + "loss": 0.86059159, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.7373645305633545 + }, + { + "auxiliary_loss_clip": 0.01566479, + "auxiliary_loss_mlp": 0.01324276, + "balance_loss_clip": 1.21366835, + "balance_loss_mlp": 1.0620153, + "epoch": 0.11820231474522772, + "flos": 48763508342400.0, + "grad_norm": 1.8895775941989612, + "language_loss": 0.74652469, + "learning_rate": 3.918946042768707e-06, + "loss": 0.77543223, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 3.127582550048828 + }, + { + "auxiliary_loss_clip": 0.01578987, + "auxiliary_loss_mlp": 0.01325013, + "balance_loss_clip": 1.2260499, + "balance_loss_mlp": 1.06027281, + "epoch": 0.11826243799789568, + "flos": 16692463494720.0, + "grad_norm": 3.135788005204179, + "language_loss": 0.73408484, + "learning_rate": 3.918836255889908e-06, + "loss": 0.76312482, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 2.755270004272461 + }, + { + "auxiliary_loss_clip": 0.01575655, + "auxiliary_loss_mlp": 0.01335046, + "balance_loss_clip": 1.22204018, + "balance_loss_mlp": 1.07030594, + "epoch": 0.11832256125056366, + "flos": 16911766301760.0, + "grad_norm": 7.354226604164455, + "language_loss": 0.8843298, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.91343683, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 2.8164656162261963 + }, + { + "auxiliary_loss_clip": 0.0157376, + "auxiliary_loss_mlp": 0.01320684, + "balance_loss_clip": 1.22110033, + "balance_loss_mlp": 1.05689812, + "epoch": 0.11838268450323162, + "flos": 22822625632320.0, + "grad_norm": 2.564471592636538, + "language_loss": 0.67169011, + "learning_rate": 3.918616463849087e-06, + "loss": 0.7006346, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.8332653045654297 + }, + { + "auxiliary_loss_clip": 0.01583344, + "auxiliary_loss_mlp": 0.01332231, + "balance_loss_clip": 1.23027492, + "balance_loss_mlp": 1.07016146, + "epoch": 0.11844280775589959, + "flos": 33548588821440.0, + "grad_norm": 2.2998959805753505, + "language_loss": 0.8080582, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83721399, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.8611021041870117 + }, + { + "auxiliary_loss_clip": 0.01670648, + "auxiliary_loss_mlp": 0.01246345, + "balance_loss_clip": 1.31611466, + "balance_loss_mlp": 1.05027008, + "epoch": 0.11850293100856757, + "flos": 66357874297440.0, + "grad_norm": 0.8109752593833319, + "language_loss": 0.66127217, + "learning_rate": 3.918396380791754e-06, + "loss": 0.69044209, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 3.312683582305908 + }, + { + "auxiliary_loss_clip": 0.01570675, + "auxiliary_loss_mlp": 0.01318429, + "balance_loss_clip": 1.21716189, + "balance_loss_mlp": 1.0517813, + "epoch": 0.11856305426123553, + "flos": 24683361814080.0, + "grad_norm": 3.331999117134288, + "language_loss": 0.8028971, + "learning_rate": 3.918286230142327e-06, + "loss": 0.83178812, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 2.8254165649414062 + }, + { + "auxiliary_loss_clip": 0.01573918, + "auxiliary_loss_mlp": 0.01314602, + "balance_loss_clip": 1.21902204, + "balance_loss_mlp": 1.05024362, + "epoch": 0.1186231775139035, + "flos": 24282419227200.0, + "grad_norm": 2.2658936677554045, + "language_loss": 0.72088557, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74977076, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.852757453918457 + }, + { + "auxiliary_loss_clip": 0.01580712, + "auxiliary_loss_mlp": 0.01346533, + "balance_loss_clip": 1.2263341, + "balance_loss_mlp": 1.08350968, + "epoch": 0.11868330076657148, + "flos": 21759299101440.0, + "grad_norm": 1.90007130513943, + "language_loss": 0.72236359, + "learning_rate": 3.918065710622832e-06, + "loss": 0.75163603, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.7847347259521484 + }, + { + "auxiliary_loss_clip": 0.0158437, + "auxiliary_loss_mlp": 0.01339271, + "balance_loss_clip": 1.23029697, + "balance_loss_mlp": 1.06919026, + "epoch": 0.11874342401923944, + "flos": 17194713289920.0, + "grad_norm": 2.665844301781296, + "language_loss": 0.78560168, + "learning_rate": 3.917955341761128e-06, + "loss": 0.81483805, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.761615753173828 + }, + { + "auxiliary_loss_clip": 0.01576166, + "auxiliary_loss_mlp": 0.01332849, + "balance_loss_clip": 1.22160602, + "balance_loss_mlp": 1.07344913, + "epoch": 0.11880354727190741, + "flos": 15231266557920.0, + "grad_norm": 4.221258437926487, + "language_loss": 0.75376737, + "learning_rate": 3.917844900170364e-06, + "loss": 0.78285754, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.758112668991089 + }, + { + "auxiliary_loss_clip": 0.01579575, + "auxiliary_loss_mlp": 0.01323937, + "balance_loss_clip": 1.22500026, + "balance_loss_mlp": 1.06549168, + "epoch": 0.11886367052457537, + "flos": 27312264671040.0, + "grad_norm": 2.0185132348679966, + "language_loss": 0.75309873, + "learning_rate": 3.91773438585473e-06, + "loss": 0.78213388, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 2.911698818206787 + }, + { + "auxiliary_loss_clip": 0.01582851, + "auxiliary_loss_mlp": 0.01338941, + "balance_loss_clip": 1.22805572, + "balance_loss_mlp": 1.07420099, + "epoch": 0.11892379377724335, + "flos": 21800413591200.0, + "grad_norm": 3.65631858369222, + "language_loss": 0.74514019, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.77435815, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 2.8779959678649902 + }, + { + "auxiliary_loss_clip": 0.01576141, + "auxiliary_loss_mlp": 0.01321344, + "balance_loss_clip": 1.22202706, + "balance_loss_mlp": 1.06118155, + "epoch": 0.11898391702991132, + "flos": 13992748034400.0, + "grad_norm": 2.8072537358881258, + "language_loss": 0.73705244, + "learning_rate": 3.917513139065616e-06, + "loss": 0.76602727, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.78548002243042 + }, + { + "auxiliary_loss_clip": 0.01573746, + "auxiliary_loss_mlp": 0.01310747, + "balance_loss_clip": 1.21931839, + "balance_loss_mlp": 1.04963076, + "epoch": 0.11904404028257928, + "flos": 32237361286560.0, + "grad_norm": 2.2724954093049283, + "language_loss": 0.98750424, + "learning_rate": 3.917402406600525e-06, + "loss": 1.0163492, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.898315906524658 + }, + { + "auxiliary_loss_clip": 0.01577169, + "auxiliary_loss_mlp": 0.01314013, + "balance_loss_clip": 1.22231889, + "balance_loss_mlp": 1.05327892, + "epoch": 0.11910416353524726, + "flos": 23588516617920.0, + "grad_norm": 2.4318273725497703, + "language_loss": 0.86647636, + "learning_rate": 3.917291601427342e-06, + "loss": 0.89538825, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.8033742904663086 + }, + { + "auxiliary_loss_clip": 0.01574597, + "auxiliary_loss_mlp": 0.01328069, + "balance_loss_clip": 1.22097993, + "balance_loss_mlp": 1.0669533, + "epoch": 0.11916428678791523, + "flos": 25335125873280.0, + "grad_norm": 2.117426842850808, + "language_loss": 0.84950709, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87853378, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.844224452972412 + }, + { + "auxiliary_loss_clip": 0.01580151, + "auxiliary_loss_mlp": 0.0131541, + "balance_loss_clip": 1.22729826, + "balance_loss_mlp": 1.05734599, + "epoch": 0.11922441004058319, + "flos": 19790276929920.0, + "grad_norm": 2.0616068833847354, + "language_loss": 0.85074806, + "learning_rate": 3.917069772973513e-06, + "loss": 0.8797037, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 2.7985353469848633 + }, + { + "auxiliary_loss_clip": 0.0159266, + "auxiliary_loss_mlp": 0.01330083, + "balance_loss_clip": 1.23906267, + "balance_loss_mlp": 1.07144666, + "epoch": 0.11928453329325117, + "flos": 21538099886400.0, + "grad_norm": 4.683053220877227, + "language_loss": 0.76552713, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79475451, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.7815933227539062 + }, + { + "auxiliary_loss_clip": 0.01579941, + "auxiliary_loss_mlp": 0.01335909, + "balance_loss_clip": 1.22741485, + "balance_loss_mlp": 1.07479274, + "epoch": 0.11934465654591914, + "flos": 20817192062880.0, + "grad_norm": 1.7661271575594444, + "language_loss": 0.83268428, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.86184275, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 2.7825777530670166 + }, + { + "auxiliary_loss_clip": 0.0157373, + "auxiliary_loss_mlp": 0.01308808, + "balance_loss_clip": 1.22090518, + "balance_loss_mlp": 1.0507443, + "epoch": 0.1194047797985871, + "flos": 19062390324960.0, + "grad_norm": 3.743607545926359, + "language_loss": 0.74336398, + "learning_rate": 3.916736485087216e-06, + "loss": 0.77218938, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 2.811734437942505 + }, + { + "auxiliary_loss_clip": 0.01567585, + "auxiliary_loss_mlp": 0.01326689, + "balance_loss_clip": 1.21461749, + "balance_loss_mlp": 1.06652713, + "epoch": 0.11946490305125507, + "flos": 27192827802240.0, + "grad_norm": 2.2007046198124725, + "language_loss": 0.72288287, + "learning_rate": 3.916625243753819e-06, + "loss": 0.75182569, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 2.850163221359253 + }, + { + "auxiliary_loss_clip": 0.01578447, + "auxiliary_loss_mlp": 0.01331272, + "balance_loss_clip": 1.22560906, + "balance_loss_mlp": 1.07625985, + "epoch": 0.11952502630392305, + "flos": 21142846523520.0, + "grad_norm": 2.414928269811893, + "language_loss": 0.72246873, + "learning_rate": 3.916513929741799e-06, + "loss": 0.75156599, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 2.787867546081543 + }, + { + "auxiliary_loss_clip": 0.0156902, + "auxiliary_loss_mlp": 0.01328745, + "balance_loss_clip": 1.21697307, + "balance_loss_mlp": 1.07068086, + "epoch": 0.11958514955659101, + "flos": 22126257692640.0, + "grad_norm": 2.800952806294245, + "language_loss": 0.81305945, + "learning_rate": 3.91640254305538e-06, + "loss": 0.84203708, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.838064670562744 + }, + { + "auxiliary_loss_clip": 0.01577702, + "auxiliary_loss_mlp": 0.01332911, + "balance_loss_clip": 1.22590721, + "balance_loss_mlp": 1.07465601, + "epoch": 0.11964527280925898, + "flos": 17423422280640.0, + "grad_norm": 2.7573188889894067, + "language_loss": 0.75473607, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78384221, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 2.7583062648773193 + }, + { + "auxiliary_loss_clip": 0.01617562, + "auxiliary_loss_mlp": 0.01221191, + "balance_loss_clip": 1.26725221, + "balance_loss_mlp": 1.012146, + "epoch": 0.11970539606192696, + "flos": 70686127558080.0, + "grad_norm": 0.8641216650998162, + "language_loss": 0.55229175, + "learning_rate": 3.916179551676238e-06, + "loss": 0.58067918, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 4.867968320846558 + }, + { + "auxiliary_loss_clip": 0.01570996, + "auxiliary_loss_mlp": 0.0132476, + "balance_loss_clip": 1.21749139, + "balance_loss_mlp": 1.07260823, + "epoch": 0.11976551931459492, + "flos": 21217148517600.0, + "grad_norm": 3.057326129369861, + "language_loss": 0.78543264, + "learning_rate": 3.916067946991971e-06, + "loss": 0.81439018, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.879852056503296 + }, + { + "auxiliary_loss_clip": 0.01571231, + "auxiliary_loss_mlp": 0.01352622, + "balance_loss_clip": 1.21869636, + "balance_loss_mlp": 1.0960834, + "epoch": 0.11982564256726289, + "flos": 25991403383520.0, + "grad_norm": 1.8962010449321989, + "language_loss": 0.79302049, + "learning_rate": 3.915956269650216e-06, + "loss": 0.82225907, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.907721996307373 + }, + { + "auxiliary_loss_clip": 0.01571635, + "auxiliary_loss_mlp": 0.01345656, + "balance_loss_clip": 1.21900725, + "balance_loss_mlp": 1.09579301, + "epoch": 0.11988576581993086, + "flos": 21652909519680.0, + "grad_norm": 1.8910036700070723, + "language_loss": 0.82343554, + "learning_rate": 3.915844519655208e-06, + "loss": 0.85260844, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 2.911320209503174 + }, + { + "auxiliary_loss_clip": 0.01567013, + "auxiliary_loss_mlp": 0.0136389, + "balance_loss_clip": 1.21378636, + "balance_loss_mlp": 1.11421752, + "epoch": 0.11994588907259883, + "flos": 17859297067200.0, + "grad_norm": 2.7731334705153134, + "language_loss": 0.88994277, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9192518, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 2.786092758178711 + }, + { + "auxiliary_loss_clip": 0.01575593, + "auxiliary_loss_mlp": 0.01360575, + "balance_loss_clip": 1.22035694, + "balance_loss_mlp": 1.10785162, + "epoch": 0.1200060123252668, + "flos": 24465083067360.0, + "grad_norm": 2.3297511197201617, + "language_loss": 0.74434364, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.77370536, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.8129117488861084 + }, + { + "auxiliary_loss_clip": 0.01561197, + "auxiliary_loss_mlp": 0.01349055, + "balance_loss_clip": 1.20732021, + "balance_loss_mlp": 1.0955689, + "epoch": 0.12006613557793476, + "flos": 18734080893120.0, + "grad_norm": 2.132230382563024, + "language_loss": 0.88196301, + "learning_rate": 3.915508833793048e-06, + "loss": 0.91106558, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 2.846266984939575 + }, + { + "auxiliary_loss_clip": 0.0156455, + "auxiliary_loss_mlp": 0.01349159, + "balance_loss_clip": 1.2098912, + "balance_loss_mlp": 1.09529066, + "epoch": 0.12012625883060274, + "flos": 22269627594720.0, + "grad_norm": 2.155881376265084, + "language_loss": 0.78767204, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81680924, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 2.798004627227783 + }, + { + "auxiliary_loss_clip": 0.01567094, + "auxiliary_loss_mlp": 0.01340718, + "balance_loss_clip": 1.21223724, + "balance_loss_mlp": 1.0862782, + "epoch": 0.1201863820832707, + "flos": 21760854156000.0, + "grad_norm": 2.1305752814359398, + "language_loss": 0.73459613, + "learning_rate": 3.915284680029769e-06, + "loss": 0.76367426, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 4.310065984725952 + }, + { + "auxiliary_loss_clip": 0.01567182, + "auxiliary_loss_mlp": 0.01324707, + "balance_loss_clip": 1.2117331, + "balance_loss_mlp": 1.0687412, + "epoch": 0.12024650533593867, + "flos": 21910140851040.0, + "grad_norm": 2.8666932711833697, + "language_loss": 0.74478924, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77370816, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 4.323744058609009 + }, + { + "auxiliary_loss_clip": 0.01566662, + "auxiliary_loss_mlp": 0.01320199, + "balance_loss_clip": 1.2124815, + "balance_loss_mlp": 1.06556749, + "epoch": 0.12030662858860665, + "flos": 21691369038240.0, + "grad_norm": 2.005997538829403, + "language_loss": 0.85247558, + "learning_rate": 3.915060235755344e-06, + "loss": 0.8813442, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 4.251762866973877 + }, + { + "auxiliary_loss_clip": 0.01558872, + "auxiliary_loss_mlp": 0.01329605, + "balance_loss_clip": 1.20276821, + "balance_loss_mlp": 1.07592714, + "epoch": 0.12036675184127461, + "flos": 12934921086720.0, + "grad_norm": 2.792468365045745, + "language_loss": 0.74020749, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76909226, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 2.7943155765533447 + }, + { + "auxiliary_loss_clip": 0.01561782, + "auxiliary_loss_mlp": 0.01352082, + "balance_loss_clip": 1.20690763, + "balance_loss_mlp": 1.09478092, + "epoch": 0.12042687509394258, + "flos": 20853186251040.0, + "grad_norm": 2.6936195743103197, + "language_loss": 0.77899933, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80813795, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.855569839477539 + }, + { + "auxiliary_loss_clip": 0.01554668, + "auxiliary_loss_mlp": 0.01333524, + "balance_loss_clip": 1.19904327, + "balance_loss_mlp": 1.07565045, + "epoch": 0.12048699834661056, + "flos": 23880793933440.0, + "grad_norm": 4.023181879103718, + "language_loss": 0.72258586, + "learning_rate": 3.914723024709793e-06, + "loss": 0.75146782, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.849376678466797 + }, + { + "auxiliary_loss_clip": 0.01556609, + "auxiliary_loss_mlp": 0.01336261, + "balance_loss_clip": 1.20019603, + "balance_loss_mlp": 1.07628894, + "epoch": 0.12054712159927852, + "flos": 19758454839360.0, + "grad_norm": 3.5402168678263544, + "language_loss": 0.78219569, + "learning_rate": 3.914610475809279e-06, + "loss": 0.81112438, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 2.876617670059204 + }, + { + "auxiliary_loss_clip": 0.0159553, + "auxiliary_loss_mlp": 0.01313858, + "balance_loss_clip": 1.23660195, + "balance_loss_mlp": 1.12159729, + "epoch": 0.12060724485194649, + "flos": 51678202435200.0, + "grad_norm": 0.9619920833579864, + "language_loss": 0.58003968, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60913354, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 3.1734464168548584 + }, + { + "auxiliary_loss_clip": 0.01552189, + "auxiliary_loss_mlp": 0.01329223, + "balance_loss_clip": 1.19424677, + "balance_loss_mlp": 1.0690608, + "epoch": 0.12066736810461445, + "flos": 18992487997440.0, + "grad_norm": 3.381430313160563, + "language_loss": 0.76392418, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.79273832, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 2.798733949661255 + }, + { + "auxiliary_loss_clip": 0.01557341, + "auxiliary_loss_mlp": 0.01322528, + "balance_loss_clip": 1.20064163, + "balance_loss_mlp": 1.05378246, + "epoch": 0.12072749135728243, + "flos": 16473843394560.0, + "grad_norm": 3.1116390167509635, + "language_loss": 0.83449912, + "learning_rate": 3.914272393511494e-06, + "loss": 0.86329782, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.809321880340576 + }, + { + "auxiliary_loss_clip": 0.01555977, + "auxiliary_loss_mlp": 0.01321509, + "balance_loss_clip": 1.19818544, + "balance_loss_mlp": 1.05161917, + "epoch": 0.1207876146099504, + "flos": 18079055012160.0, + "grad_norm": 2.636842747440979, + "language_loss": 0.84121609, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86999094, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.85093355178833 + }, + { + "auxiliary_loss_clip": 0.01564489, + "auxiliary_loss_mlp": 0.01343911, + "balance_loss_clip": 1.20595336, + "balance_loss_mlp": 1.07096958, + "epoch": 0.12084773786261836, + "flos": 21873729453120.0, + "grad_norm": 1.8702967689002141, + "language_loss": 0.84256208, + "learning_rate": 3.914046642358844e-06, + "loss": 0.87164605, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 2.8776962757110596 + }, + { + "auxiliary_loss_clip": 0.01558802, + "auxiliary_loss_mlp": 0.01352608, + "balance_loss_clip": 1.2014693, + "balance_loss_mlp": 1.0745163, + "epoch": 0.12090786111528634, + "flos": 18335034714240.0, + "grad_norm": 1.8692121807587678, + "language_loss": 0.8410933, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.87020743, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.7899444103240967 + }, + { + "auxiliary_loss_clip": 0.01554727, + "auxiliary_loss_mlp": 0.01332555, + "balance_loss_clip": 1.19600272, + "balance_loss_mlp": 1.06190205, + "epoch": 0.1209679843679543, + "flos": 21107990180160.0, + "grad_norm": 2.451784674477752, + "language_loss": 0.96284306, + "learning_rate": 3.913820600882834e-06, + "loss": 0.99171585, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.8238725662231445 + }, + { + "auxiliary_loss_clip": 0.01556339, + "auxiliary_loss_mlp": 0.0133357, + "balance_loss_clip": 1.19662595, + "balance_loss_mlp": 1.06005621, + "epoch": 0.12102810762062227, + "flos": 29243585887200.0, + "grad_norm": 2.056096611217094, + "language_loss": 0.80817354, + "learning_rate": 3.913707471284283e-06, + "loss": 0.83707261, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.9433491230010986 + }, + { + "auxiliary_loss_clip": 0.01556012, + "auxiliary_loss_mlp": 0.01349422, + "balance_loss_clip": 1.19619787, + "balance_loss_mlp": 1.08010471, + "epoch": 0.12108823087329025, + "flos": 17932612929120.0, + "grad_norm": 3.596617002425998, + "language_loss": 0.77337044, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.80242479, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.7499067783355713 + }, + { + "auxiliary_loss_clip": 0.01558926, + "auxiliary_loss_mlp": 0.01323589, + "balance_loss_clip": 1.20025051, + "balance_loss_mlp": 1.05160069, + "epoch": 0.12114835412595822, + "flos": 22094094248640.0, + "grad_norm": 2.5034557694830344, + "language_loss": 0.87274873, + "learning_rate": 3.913480994387535e-06, + "loss": 0.9015739, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.837451219558716 + }, + { + "auxiliary_loss_clip": 0.01556214, + "auxiliary_loss_mlp": 0.01346561, + "balance_loss_clip": 1.19545436, + "balance_loss_mlp": 1.08639908, + "epoch": 0.12120847737862618, + "flos": 20414428924320.0, + "grad_norm": 2.2545994907394666, + "language_loss": 0.69791949, + "learning_rate": 3.913367647097926e-06, + "loss": 0.72694719, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.7636971473693848 + }, + { + "auxiliary_loss_clip": 0.01551274, + "auxiliary_loss_mlp": 0.01338442, + "balance_loss_clip": 1.19161034, + "balance_loss_mlp": 1.08056819, + "epoch": 0.12126860063129415, + "flos": 22311197222400.0, + "grad_norm": 11.376160862271108, + "language_loss": 0.80708408, + "learning_rate": 3.913254227253225e-06, + "loss": 0.83598125, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 2.9048373699188232 + }, + { + "auxiliary_loss_clip": 0.01562968, + "auxiliary_loss_mlp": 0.01338055, + "balance_loss_clip": 1.20276713, + "balance_loss_mlp": 1.07770216, + "epoch": 0.12132872388396213, + "flos": 13701001713120.0, + "grad_norm": 2.3961216989739893, + "language_loss": 0.68904757, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71805781, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.82405948638916 + }, + { + "auxiliary_loss_clip": 0.01561567, + "auxiliary_loss_mlp": 0.01336781, + "balance_loss_clip": 1.20146322, + "balance_loss_mlp": 1.08215046, + "epoch": 0.12138884713663009, + "flos": 26469265007520.0, + "grad_norm": 1.8852109459935453, + "language_loss": 0.72612882, + "learning_rate": 3.91302716991575e-06, + "loss": 0.75511229, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 2.9759137630462646 + }, + { + "auxiliary_loss_clip": 0.01556179, + "auxiliary_loss_mlp": 0.01342409, + "balance_loss_clip": 1.19562817, + "balance_loss_mlp": 1.09254658, + "epoch": 0.12144897038929806, + "flos": 26144065684800.0, + "grad_norm": 2.501135245972832, + "language_loss": 0.92071068, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94969654, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 3.0331668853759766 + }, + { + "auxiliary_loss_clip": 0.01552453, + "auxiliary_loss_mlp": 0.01322342, + "balance_loss_clip": 1.19116402, + "balance_loss_mlp": 1.06275129, + "epoch": 0.12150909364196603, + "flos": 24719849068320.0, + "grad_norm": 2.3195074646295213, + "language_loss": 0.78613204, + "learning_rate": 3.912799822409549e-06, + "loss": 0.81487995, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 2.9130828380584717 + }, + { + "auxiliary_loss_clip": 0.01556656, + "auxiliary_loss_mlp": 0.01349317, + "balance_loss_clip": 1.19520271, + "balance_loss_mlp": 1.10002685, + "epoch": 0.121569216894634, + "flos": 25189063071840.0, + "grad_norm": 2.195920625874336, + "language_loss": 0.81057686, + "learning_rate": 3.912686039853952e-06, + "loss": 0.83963656, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.907430648803711 + }, + { + "auxiliary_loss_clip": 0.01565696, + "auxiliary_loss_mlp": 0.0134761, + "balance_loss_clip": 1.20405221, + "balance_loss_mlp": 1.09641218, + "epoch": 0.12162934014730196, + "flos": 13445818502400.0, + "grad_norm": 1.8647874432533746, + "language_loss": 0.85111904, + "learning_rate": 3.912572184769108e-06, + "loss": 0.88025212, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 2.8470211029052734 + }, + { + "auxiliary_loss_clip": 0.01564617, + "auxiliary_loss_mlp": 0.01336807, + "balance_loss_clip": 1.2026186, + "balance_loss_mlp": 1.08446503, + "epoch": 0.12168946339996994, + "flos": 16948025987040.0, + "grad_norm": 2.5240388209113815, + "language_loss": 0.85893941, + "learning_rate": 3.912458257159335e-06, + "loss": 0.88795364, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 2.8239126205444336 + }, + { + "auxiliary_loss_clip": 0.01550573, + "auxiliary_loss_mlp": 0.01348691, + "balance_loss_clip": 1.18774056, + "balance_loss_mlp": 1.10035372, + "epoch": 0.12174958665263791, + "flos": 29823968420640.0, + "grad_norm": 2.280093740800281, + "language_loss": 0.72474092, + "learning_rate": 3.912344257028954e-06, + "loss": 0.75373352, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 2.887769937515259 + }, + { + "auxiliary_loss_clip": 0.0155625, + "auxiliary_loss_mlp": 0.01356593, + "balance_loss_clip": 1.19256592, + "balance_loss_mlp": 1.10920978, + "epoch": 0.12180970990530587, + "flos": 24644067876000.0, + "grad_norm": 1.7130574256958402, + "language_loss": 0.7632153, + "learning_rate": 3.912230184382286e-06, + "loss": 0.79234374, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 2.840573787689209 + }, + { + "auxiliary_loss_clip": 0.01543299, + "auxiliary_loss_mlp": 0.0132271, + "balance_loss_clip": 1.18140423, + "balance_loss_mlp": 1.06655276, + "epoch": 0.12186983315797385, + "flos": 20523776902560.0, + "grad_norm": 2.29045779536457, + "language_loss": 0.88854069, + "learning_rate": 3.912116039223659e-06, + "loss": 0.91720074, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 2.804133176803589 + }, + { + "auxiliary_loss_clip": 0.01557713, + "auxiliary_loss_mlp": 0.01326438, + "balance_loss_clip": 1.19449055, + "balance_loss_mlp": 1.0698998, + "epoch": 0.12192995641064182, + "flos": 27820810540800.0, + "grad_norm": 1.9709287434737885, + "language_loss": 0.7600224, + "learning_rate": 3.912001821557399e-06, + "loss": 0.78886396, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 2.8677890300750732 + }, + { + "auxiliary_loss_clip": 0.01555792, + "auxiliary_loss_mlp": 0.01335232, + "balance_loss_clip": 1.19415724, + "balance_loss_mlp": 1.08212698, + "epoch": 0.12199007966330978, + "flos": 22019261260320.0, + "grad_norm": 2.0708060376808786, + "language_loss": 0.7747978, + "learning_rate": 3.911887531387839e-06, + "loss": 0.80370808, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 4.350813388824463 + }, + { + "auxiliary_loss_clip": 0.01559809, + "auxiliary_loss_mlp": 0.01329939, + "balance_loss_clip": 1.19567513, + "balance_loss_mlp": 1.07702446, + "epoch": 0.12205020291597775, + "flos": 23297490931680.0, + "grad_norm": 1.953790591141665, + "language_loss": 0.79339123, + "learning_rate": 3.911773168719313e-06, + "loss": 0.82228869, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.949892044067383 + }, + { + "auxiliary_loss_clip": 0.01555737, + "auxiliary_loss_mlp": 0.01312776, + "balance_loss_clip": 1.19207418, + "balance_loss_mlp": 1.05795467, + "epoch": 0.12211032616864573, + "flos": 26034034999680.0, + "grad_norm": 2.2151832513159087, + "language_loss": 0.7485134, + "learning_rate": 3.911658733556155e-06, + "loss": 0.77719849, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.913179397583008 + }, + { + "auxiliary_loss_clip": 0.01561281, + "auxiliary_loss_mlp": 0.0133377, + "balance_loss_clip": 1.19923985, + "balance_loss_mlp": 1.08161807, + "epoch": 0.12217044942131369, + "flos": 20412949726080.0, + "grad_norm": 2.0287948983797373, + "language_loss": 0.75428796, + "learning_rate": 3.911544225902707e-06, + "loss": 0.78323841, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.9061925411224365 + }, + { + "auxiliary_loss_clip": 0.01550994, + "auxiliary_loss_mlp": 0.01308176, + "balance_loss_clip": 1.18872988, + "balance_loss_mlp": 1.05449843, + "epoch": 0.12223057267398166, + "flos": 22859378383680.0, + "grad_norm": 1.6692703867676402, + "language_loss": 0.8955487, + "learning_rate": 3.911429645763311e-06, + "loss": 0.92414045, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.848661184310913 + }, + { + "auxiliary_loss_clip": 0.01563773, + "auxiliary_loss_mlp": 0.0135244, + "balance_loss_clip": 1.20107079, + "balance_loss_mlp": 1.10105181, + "epoch": 0.12229069592664964, + "flos": 20049632238240.0, + "grad_norm": 2.26796283031294, + "language_loss": 0.65357375, + "learning_rate": 3.911314993142311e-06, + "loss": 0.68273592, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 2.8565492630004883 + }, + { + "auxiliary_loss_clip": 0.01551595, + "auxiliary_loss_mlp": 0.01305488, + "balance_loss_clip": 1.18850863, + "balance_loss_mlp": 1.04723263, + "epoch": 0.1223508191793176, + "flos": 22276492591680.0, + "grad_norm": 1.953548053912095, + "language_loss": 0.76653898, + "learning_rate": 3.911200268044055e-06, + "loss": 0.79510975, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.9155192375183105 + }, + { + "auxiliary_loss_clip": 0.0156221, + "auxiliary_loss_mlp": 0.01319118, + "balance_loss_clip": 1.20113873, + "balance_loss_mlp": 1.05800211, + "epoch": 0.12241094243198557, + "flos": 21287999049120.0, + "grad_norm": 1.9408977216144452, + "language_loss": 0.71545994, + "learning_rate": 3.911085470472892e-06, + "loss": 0.74427319, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.886659622192383 + }, + { + "auxiliary_loss_clip": 0.01556502, + "auxiliary_loss_mlp": 0.01328068, + "balance_loss_clip": 1.19368875, + "balance_loss_mlp": 1.07057595, + "epoch": 0.12247106568465355, + "flos": 17384204198880.0, + "grad_norm": 1.7576845361477265, + "language_loss": 0.83200967, + "learning_rate": 3.910970600433178e-06, + "loss": 0.86085534, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 4.474416494369507 + }, + { + "auxiliary_loss_clip": 0.01560187, + "auxiliary_loss_mlp": 0.01329518, + "balance_loss_clip": 1.19804168, + "balance_loss_mlp": 1.07278883, + "epoch": 0.12253118893732151, + "flos": 27047485635840.0, + "grad_norm": 5.838682309389109, + "language_loss": 0.80598623, + "learning_rate": 3.910855657929267e-06, + "loss": 0.83488327, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 4.389427423477173 + }, + { + "auxiliary_loss_clip": 0.0163556, + "auxiliary_loss_mlp": 0.0135527, + "balance_loss_clip": 1.26941252, + "balance_loss_mlp": 1.12028503, + "epoch": 0.12259131218998948, + "flos": 53867058408000.0, + "grad_norm": 0.8665192516607242, + "language_loss": 0.58582675, + "learning_rate": 3.910740642965518e-06, + "loss": 0.61573505, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 3.300053119659424 + }, + { + "auxiliary_loss_clip": 0.01551936, + "auxiliary_loss_mlp": 0.01333718, + "balance_loss_clip": 1.18935442, + "balance_loss_mlp": 1.07813299, + "epoch": 0.12265143544265744, + "flos": 17893698272640.0, + "grad_norm": 2.4565784558354378, + "language_loss": 0.80808842, + "learning_rate": 3.910625555546292e-06, + "loss": 0.83694494, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 2.8881161212921143 + }, + { + "auxiliary_loss_clip": 0.01550259, + "auxiliary_loss_mlp": 0.01345081, + "balance_loss_clip": 1.18663764, + "balance_loss_mlp": 1.09350204, + "epoch": 0.12271155869532542, + "flos": 21802158286560.0, + "grad_norm": 2.0493025739718806, + "language_loss": 0.8339709, + "learning_rate": 3.910510395675953e-06, + "loss": 0.86292434, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 2.903430938720703 + }, + { + "auxiliary_loss_clip": 0.01559012, + "auxiliary_loss_mlp": 0.01364315, + "balance_loss_clip": 1.19692123, + "balance_loss_mlp": 1.11540604, + "epoch": 0.12277168194799339, + "flos": 19830822497280.0, + "grad_norm": 1.9404812953415276, + "language_loss": 0.67353106, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.70276439, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 2.9048101902008057 + }, + { + "auxiliary_loss_clip": 0.0155713, + "auxiliary_loss_mlp": 0.01332006, + "balance_loss_clip": 1.19520319, + "balance_loss_mlp": 1.08385968, + "epoch": 0.12283180520066135, + "flos": 23223226865760.0, + "grad_norm": 2.142341224009046, + "language_loss": 0.81672025, + "learning_rate": 3.910279858599409e-06, + "loss": 0.84561169, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 2.8901402950286865 + }, + { + "auxiliary_loss_clip": 0.01561518, + "auxiliary_loss_mlp": 0.01350881, + "balance_loss_clip": 1.20077622, + "balance_loss_mlp": 1.10178161, + "epoch": 0.12289192845332933, + "flos": 18590483422080.0, + "grad_norm": 1.908253188071656, + "language_loss": 0.80540925, + "learning_rate": 3.910164481401946e-06, + "loss": 0.83453321, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.8644232749938965 + }, + { + "auxiliary_loss_clip": 0.01566122, + "auxiliary_loss_mlp": 0.01339852, + "balance_loss_clip": 1.20600736, + "balance_loss_mlp": 1.09418523, + "epoch": 0.1229520517059973, + "flos": 25771569582240.0, + "grad_norm": 1.8544564786591264, + "language_loss": 0.7809732, + "learning_rate": 3.910049031770853e-06, + "loss": 0.81003296, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.887244939804077 + }, + { + "auxiliary_loss_clip": 0.01561065, + "auxiliary_loss_mlp": 0.01354709, + "balance_loss_clip": 1.20024252, + "balance_loss_mlp": 1.10751665, + "epoch": 0.12301217495866526, + "flos": 20889294223680.0, + "grad_norm": 1.9664494233851029, + "language_loss": 0.67525113, + "learning_rate": 3.90993350971051e-06, + "loss": 0.70440888, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.9249579906463623 + }, + { + "auxiliary_loss_clip": 0.01560489, + "auxiliary_loss_mlp": 0.01333917, + "balance_loss_clip": 1.19944906, + "balance_loss_mlp": 1.08786869, + "epoch": 0.12307229821133324, + "flos": 22380796124640.0, + "grad_norm": 2.3339431129588433, + "language_loss": 0.72850507, + "learning_rate": 3.909817915225297e-06, + "loss": 0.75744915, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 2.8319039344787598 + }, + { + "auxiliary_loss_clip": 0.01555611, + "auxiliary_loss_mlp": 0.01339403, + "balance_loss_clip": 1.19578254, + "balance_loss_mlp": 1.08706093, + "epoch": 0.1231324214640012, + "flos": 23369631020640.0, + "grad_norm": 2.50289020144491, + "language_loss": 0.77163935, + "learning_rate": 3.909702248319597e-06, + "loss": 0.80058944, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 2.8735039234161377 + }, + { + "auxiliary_loss_clip": 0.01558138, + "auxiliary_loss_mlp": 0.01315278, + "balance_loss_clip": 1.19607949, + "balance_loss_mlp": 1.06388974, + "epoch": 0.12319254471666917, + "flos": 23769321978240.0, + "grad_norm": 2.257395344932011, + "language_loss": 0.85611457, + "learning_rate": 3.909586508997797e-06, + "loss": 0.88484871, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.8297417163848877 + }, + { + "auxiliary_loss_clip": 0.01560116, + "auxiliary_loss_mlp": 0.01333379, + "balance_loss_clip": 1.19937968, + "balance_loss_mlp": 1.08008361, + "epoch": 0.12325266796933713, + "flos": 23552408645280.0, + "grad_norm": 2.1002584518006433, + "language_loss": 0.75748295, + "learning_rate": 3.909470697264285e-06, + "loss": 0.78641796, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.841898202896118 + }, + { + "auxiliary_loss_clip": 0.01558479, + "auxiliary_loss_mlp": 0.01331686, + "balance_loss_clip": 1.19798124, + "balance_loss_mlp": 1.07514739, + "epoch": 0.12331279122200511, + "flos": 24426206339040.0, + "grad_norm": 2.038203823743785, + "language_loss": 0.8089062, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83780783, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.892359733581543 + }, + { + "auxiliary_loss_clip": 0.01559337, + "auxiliary_loss_mlp": 0.01319133, + "balance_loss_clip": 1.19847727, + "balance_loss_mlp": 1.06431103, + "epoch": 0.12337291447467308, + "flos": 25486688257920.0, + "grad_norm": 2.393260926353901, + "language_loss": 0.80113292, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82991755, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.889190196990967 + }, + { + "auxiliary_loss_clip": 0.01557907, + "auxiliary_loss_mlp": 0.0133589, + "balance_loss_clip": 1.19798493, + "balance_loss_mlp": 1.07534599, + "epoch": 0.12343303772734104, + "flos": 23552256932640.0, + "grad_norm": 2.3253488521227967, + "language_loss": 0.73610556, + "learning_rate": 3.909122827637406e-06, + "loss": 0.76504356, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.8452324867248535 + }, + { + "auxiliary_loss_clip": 0.01550427, + "auxiliary_loss_mlp": 0.01328422, + "balance_loss_clip": 1.18968153, + "balance_loss_mlp": 1.07340908, + "epoch": 0.12349316098000902, + "flos": 47560642653600.0, + "grad_norm": 2.4002301838654407, + "language_loss": 0.74566185, + "learning_rate": 3.909006726300991e-06, + "loss": 0.7744503, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 3.0561277866363525 + }, + { + "auxiliary_loss_clip": 0.0156429, + "auxiliary_loss_mlp": 0.01329941, + "balance_loss_clip": 1.20306468, + "balance_loss_mlp": 1.07626343, + "epoch": 0.12355328423267699, + "flos": 25048310212800.0, + "grad_norm": 2.154122009907526, + "language_loss": 0.85511506, + "learning_rate": 3.908890552574849e-06, + "loss": 0.88405734, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.85304594039917 + }, + { + "auxiliary_loss_clip": 0.01553484, + "auxiliary_loss_mlp": 0.01331903, + "balance_loss_clip": 1.19263363, + "balance_loss_mlp": 1.07727206, + "epoch": 0.12361340748534495, + "flos": 27711614275200.0, + "grad_norm": 2.542018651517905, + "language_loss": 0.7772193, + "learning_rate": 3.908774306463384e-06, + "loss": 0.80607319, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.8836090564727783 + }, + { + "auxiliary_loss_clip": 0.01546821, + "auxiliary_loss_mlp": 0.01334149, + "balance_loss_clip": 1.18520141, + "balance_loss_mlp": 1.0684551, + "epoch": 0.12367353073801293, + "flos": 26143003696320.0, + "grad_norm": 2.090750860917389, + "language_loss": 0.8330797, + "learning_rate": 3.908657987971009e-06, + "loss": 0.86188942, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 2.820122480392456 + }, + { + "auxiliary_loss_clip": 0.01551631, + "auxiliary_loss_mlp": 0.01312609, + "balance_loss_clip": 1.19065809, + "balance_loss_mlp": 1.04977608, + "epoch": 0.1237336539906809, + "flos": 25158416754240.0, + "grad_norm": 1.5829529232500057, + "language_loss": 0.78364491, + "learning_rate": 3.90854159710213e-06, + "loss": 0.81228733, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 2.8750531673431396 + }, + { + "auxiliary_loss_clip": 0.0155135, + "auxiliary_loss_mlp": 0.01314399, + "balance_loss_clip": 1.19009995, + "balance_loss_mlp": 1.05309248, + "epoch": 0.12379377724334886, + "flos": 15306402971520.0, + "grad_norm": 3.1740145006681724, + "language_loss": 0.83484119, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.86349869, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.8163280487060547 + }, + { + "auxiliary_loss_clip": 0.01554022, + "auxiliary_loss_mlp": 0.01315446, + "balance_loss_clip": 1.19278216, + "balance_loss_mlp": 1.05223215, + "epoch": 0.12385390049601683, + "flos": 21318417797760.0, + "grad_norm": 3.730913731458219, + "language_loss": 0.81617343, + "learning_rate": 3.908308598252523e-06, + "loss": 0.84486818, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.841078042984009 + }, + { + "auxiliary_loss_clip": 0.01550871, + "auxiliary_loss_mlp": 0.01332477, + "balance_loss_clip": 1.18911624, + "balance_loss_mlp": 1.07384038, + "epoch": 0.1239140237486848, + "flos": 15117784410240.0, + "grad_norm": 2.1594817884554134, + "language_loss": 0.86618173, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.89501524, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 2.742840528488159 + }, + { + "auxiliary_loss_clip": 0.01551314, + "auxiliary_loss_mlp": 0.01315526, + "balance_loss_clip": 1.18798518, + "balance_loss_mlp": 1.06127667, + "epoch": 0.12397414700135277, + "flos": 21978715692960.0, + "grad_norm": 1.9311156802839262, + "language_loss": 0.85162443, + "learning_rate": 3.908075309949906e-06, + "loss": 0.88029283, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 2.784888982772827 + }, + { + "auxiliary_loss_clip": 0.01557721, + "auxiliary_loss_mlp": 0.0133053, + "balance_loss_clip": 1.1937834, + "balance_loss_mlp": 1.07380104, + "epoch": 0.12403427025402074, + "flos": 13402238682240.0, + "grad_norm": 1.8334345579791473, + "language_loss": 0.79191554, + "learning_rate": 3.907958557264774e-06, + "loss": 0.82079804, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.902552604675293 + }, + { + "auxiliary_loss_clip": 0.01551481, + "auxiliary_loss_mlp": 0.01341891, + "balance_loss_clip": 1.18743682, + "balance_loss_mlp": 1.08363569, + "epoch": 0.12409439350668872, + "flos": 15306175402560.0, + "grad_norm": 3.591986109825483, + "language_loss": 0.79713446, + "learning_rate": 3.907841732229663e-06, + "loss": 0.82606816, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 2.866367816925049 + }, + { + "auxiliary_loss_clip": 0.01559626, + "auxiliary_loss_mlp": 0.01332023, + "balance_loss_clip": 1.19726741, + "balance_loss_mlp": 1.07510304, + "epoch": 0.12415451675935668, + "flos": 25011709174080.0, + "grad_norm": 2.421283946886301, + "language_loss": 0.9251132, + "learning_rate": 3.907724834849002e-06, + "loss": 0.95402974, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 2.828702926635742 + }, + { + "auxiliary_loss_clip": 0.01555651, + "auxiliary_loss_mlp": 0.01328234, + "balance_loss_clip": 1.19257998, + "balance_loss_mlp": 1.0709331, + "epoch": 0.12421464001202465, + "flos": 23661946264320.0, + "grad_norm": 1.803512022992563, + "language_loss": 0.80997181, + "learning_rate": 3.907607865127225e-06, + "loss": 0.83881062, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 2.878472089767456 + }, + { + "auxiliary_loss_clip": 0.01627387, + "auxiliary_loss_mlp": 0.01439766, + "balance_loss_clip": 1.26316154, + "balance_loss_mlp": 1.25055695, + "epoch": 0.12427476326469263, + "flos": 65739866664960.0, + "grad_norm": 0.9684163935356764, + "language_loss": 0.63268471, + "learning_rate": 3.907490823068766e-06, + "loss": 0.66335624, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 4.80789852142334 + }, + { + "auxiliary_loss_clip": 0.01561152, + "auxiliary_loss_mlp": 0.0132818, + "balance_loss_clip": 1.19809151, + "balance_loss_mlp": 1.06515682, + "epoch": 0.12433488651736059, + "flos": 24537754150560.0, + "grad_norm": 1.8933329462501354, + "language_loss": 0.93752563, + "learning_rate": 3.907373708678063e-06, + "loss": 0.96641892, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 2.8871045112609863 + }, + { + "auxiliary_loss_clip": 0.01557396, + "auxiliary_loss_mlp": 0.01351472, + "balance_loss_clip": 1.19445372, + "balance_loss_mlp": 1.08673191, + "epoch": 0.12439500977002856, + "flos": 21033726114240.0, + "grad_norm": 2.2553474614315467, + "language_loss": 0.81009632, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.839185, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.8306024074554443 + }, + { + "auxiliary_loss_clip": 0.01554297, + "auxiliary_loss_mlp": 0.01353663, + "balance_loss_clip": 1.19009733, + "balance_loss_mlp": 1.08625305, + "epoch": 0.12445513302269653, + "flos": 26832885920640.0, + "grad_norm": 1.5293541556565926, + "language_loss": 0.77445352, + "learning_rate": 3.907139262917696e-06, + "loss": 0.80353308, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.8665950298309326 + }, + { + "auxiliary_loss_clip": 0.01553263, + "auxiliary_loss_mlp": 0.01341123, + "balance_loss_clip": 1.19017613, + "balance_loss_mlp": 1.06875324, + "epoch": 0.1245152562753645, + "flos": 18370877189760.0, + "grad_norm": 2.2949713075865783, + "language_loss": 0.81064034, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83958423, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 2.8546383380889893 + }, + { + "auxiliary_loss_clip": 0.01558922, + "auxiliary_loss_mlp": 0.01337213, + "balance_loss_clip": 1.1949985, + "balance_loss_mlp": 1.07094729, + "epoch": 0.12457537952803246, + "flos": 33110703842400.0, + "grad_norm": 3.9487039191256037, + "language_loss": 0.78050303, + "learning_rate": 3.906904527881684e-06, + "loss": 0.8094644, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 2.9031906127929688 + }, + { + "auxiliary_loss_clip": 0.01552206, + "auxiliary_loss_mlp": 0.01351347, + "balance_loss_clip": 1.18910122, + "balance_loss_mlp": 1.08431792, + "epoch": 0.12463550278070043, + "flos": 22272244637760.0, + "grad_norm": 2.123300828786484, + "language_loss": 0.75567973, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.78471518, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.863659143447876 + }, + { + "auxiliary_loss_clip": 0.01549048, + "auxiliary_loss_mlp": 0.0133187, + "balance_loss_clip": 1.18393731, + "balance_loss_mlp": 1.06465042, + "epoch": 0.12469562603336841, + "flos": 14680506281760.0, + "grad_norm": 2.396665216993697, + "language_loss": 0.90623611, + "learning_rate": 3.906669503605631e-06, + "loss": 0.93504524, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 2.8043277263641357 + }, + { + "auxiliary_loss_clip": 0.01553331, + "auxiliary_loss_mlp": 0.01348565, + "balance_loss_clip": 1.18910718, + "balance_loss_mlp": 1.08287168, + "epoch": 0.12475574928603637, + "flos": 24647102128800.0, + "grad_norm": 4.327808466311262, + "language_loss": 0.84334648, + "learning_rate": 3.906551883013728e-06, + "loss": 0.87236547, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 5.978694677352905 + }, + { + "auxiliary_loss_clip": 0.01545872, + "auxiliary_loss_mlp": 0.01332753, + "balance_loss_clip": 1.18322134, + "balance_loss_mlp": 1.0601933, + "epoch": 0.12481587253870434, + "flos": 21765177966240.0, + "grad_norm": 2.024078635080798, + "language_loss": 0.73521054, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76399684, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 4.252486705780029 + }, + { + "auxiliary_loss_clip": 0.01541167, + "auxiliary_loss_mlp": 0.01324053, + "balance_loss_clip": 1.1780746, + "balance_loss_mlp": 1.06198359, + "epoch": 0.12487599579137232, + "flos": 21434592844800.0, + "grad_norm": 1.9630414983980846, + "language_loss": 0.75792217, + "learning_rate": 3.906316424944469e-06, + "loss": 0.78657436, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 3.0004913806915283 + }, + { + "auxiliary_loss_clip": 0.0154573, + "auxiliary_loss_mlp": 0.01320041, + "balance_loss_clip": 1.18260241, + "balance_loss_mlp": 1.05758977, + "epoch": 0.12493611904404028, + "flos": 16109501846400.0, + "grad_norm": 2.1587804618469417, + "language_loss": 0.83157718, + "learning_rate": 3.906198587476043e-06, + "loss": 0.86023486, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 2.821437120437622 + }, + { + "auxiliary_loss_clip": 0.01547512, + "auxiliary_loss_mlp": 0.0133291, + "balance_loss_clip": 1.18387032, + "balance_loss_mlp": 1.07846951, + "epoch": 0.12499624229670825, + "flos": 21582438269760.0, + "grad_norm": 2.087268362734363, + "language_loss": 0.75334942, + "learning_rate": 3.906080677724374e-06, + "loss": 0.78215361, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 2.8102564811706543 + }, + { + "auxiliary_loss_clip": 0.01544733, + "auxiliary_loss_mlp": 0.01323821, + "balance_loss_clip": 1.18120468, + "balance_loss_mlp": 1.06289601, + "epoch": 0.1250563655493762, + "flos": 25701212116800.0, + "grad_norm": 2.497943594268396, + "language_loss": 0.83916759, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86785316, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 2.789917230606079 + }, + { + "auxiliary_loss_clip": 0.01548915, + "auxiliary_loss_mlp": 0.01333241, + "balance_loss_clip": 1.18605614, + "balance_loss_mlp": 1.07613039, + "epoch": 0.12511648880204418, + "flos": 16911387020160.0, + "grad_norm": 2.168730401623052, + "language_loss": 0.85133839, + "learning_rate": 3.9058446413892e-06, + "loss": 0.88015997, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.7796993255615234 + }, + { + "auxiliary_loss_clip": 0.01542207, + "auxiliary_loss_mlp": 0.01329821, + "balance_loss_clip": 1.1806016, + "balance_loss_mlp": 1.08053017, + "epoch": 0.12517661205471217, + "flos": 17569485082080.0, + "grad_norm": 1.7882680512748828, + "language_loss": 0.76968396, + "learning_rate": 3.905726514814646e-06, + "loss": 0.79840422, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 2.801558494567871 + }, + { + "auxiliary_loss_clip": 0.01547668, + "auxiliary_loss_mlp": 0.01345487, + "balance_loss_clip": 1.18485701, + "balance_loss_mlp": 1.08952093, + "epoch": 0.12523673530738014, + "flos": 16035617062080.0, + "grad_norm": 3.7569468096736007, + "language_loss": 0.79241335, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.82134491, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.784245491027832 + }, + { + "auxiliary_loss_clip": 0.01550524, + "auxiliary_loss_mlp": 0.01343639, + "balance_loss_clip": 1.1869148, + "balance_loss_mlp": 1.09549296, + "epoch": 0.1252968585600481, + "flos": 18809786229120.0, + "grad_norm": 2.5327937175140702, + "language_loss": 0.90003711, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92897874, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.8407540321350098 + }, + { + "auxiliary_loss_clip": 0.01547039, + "auxiliary_loss_mlp": 0.01337969, + "balance_loss_clip": 1.18312049, + "balance_loss_mlp": 1.08791614, + "epoch": 0.12535698181271607, + "flos": 27274032721440.0, + "grad_norm": 1.8855651236486675, + "language_loss": 0.80043983, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82928991, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.816988468170166 + }, + { + "auxiliary_loss_clip": 0.01551085, + "auxiliary_loss_mlp": 0.01355072, + "balance_loss_clip": 1.1872437, + "balance_loss_mlp": 1.10845208, + "epoch": 0.12541710506538403, + "flos": 22056279508800.0, + "grad_norm": 1.9578064086275777, + "language_loss": 0.88566238, + "learning_rate": 3.905253285907856e-06, + "loss": 0.91472393, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 2.8115475177764893 + }, + { + "auxiliary_loss_clip": 0.0154679, + "auxiliary_loss_mlp": 0.01333781, + "balance_loss_clip": 1.18272495, + "balance_loss_mlp": 1.08906794, + "epoch": 0.125477228318052, + "flos": 12604753175040.0, + "grad_norm": 2.221387083138594, + "language_loss": 0.87216949, + "learning_rate": 3.905134798051447e-06, + "loss": 0.90097523, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.767307758331299 + }, + { + "auxiliary_loss_clip": 0.01548348, + "auxiliary_loss_mlp": 0.01326754, + "balance_loss_clip": 1.18511295, + "balance_loss_mlp": 1.07879853, + "epoch": 0.12553735157071996, + "flos": 23880831861600.0, + "grad_norm": 2.268435650099018, + "language_loss": 0.73998547, + "learning_rate": 3.905016237952136e-06, + "loss": 0.76873642, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.872910737991333 + }, + { + "auxiliary_loss_clip": 0.01616304, + "auxiliary_loss_mlp": 0.0141275, + "balance_loss_clip": 1.2514565, + "balance_loss_mlp": 1.17623901, + "epoch": 0.12559747482338796, + "flos": 69927594635520.0, + "grad_norm": 0.9615232895135852, + "language_loss": 0.61732441, + "learning_rate": 3.904897605614418e-06, + "loss": 0.64761502, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.3860833644866943 + }, + { + "auxiliary_loss_clip": 0.0154601, + "auxiliary_loss_mlp": 0.01335499, + "balance_loss_clip": 1.18317735, + "balance_loss_mlp": 1.09173989, + "epoch": 0.12565759807605592, + "flos": 24281850304800.0, + "grad_norm": 2.009626618877974, + "language_loss": 0.77921283, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80802786, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.8299784660339355 + }, + { + "auxiliary_loss_clip": 0.01612, + "auxiliary_loss_mlp": 0.01325119, + "balance_loss_clip": 1.24756229, + "balance_loss_mlp": 1.10386658, + "epoch": 0.12571772132872389, + "flos": 56456819039520.0, + "grad_norm": 0.7937465472173124, + "language_loss": 0.59256893, + "learning_rate": 3.90466012424176e-06, + "loss": 0.62194014, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.156982898712158 + }, + { + "auxiliary_loss_clip": 0.01546875, + "auxiliary_loss_mlp": 0.01374662, + "balance_loss_clip": 1.18419242, + "balance_loss_mlp": 1.13795996, + "epoch": 0.12577784458139185, + "flos": 41249106233280.0, + "grad_norm": 1.9748927185108205, + "language_loss": 0.63348901, + "learning_rate": 3.904541275215825e-06, + "loss": 0.66270435, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.9524922370910645 + }, + { + "auxiliary_loss_clip": 0.01550847, + "auxiliary_loss_mlp": 0.01362105, + "balance_loss_clip": 1.18989396, + "balance_loss_mlp": 1.1204437, + "epoch": 0.12583796783405982, + "flos": 19757544563520.0, + "grad_norm": 2.255201058705357, + "language_loss": 0.80369949, + "learning_rate": 3.904422353969493e-06, + "loss": 0.832829, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.8441550731658936 + }, + { + "auxiliary_loss_clip": 0.01544504, + "auxiliary_loss_mlp": 0.01370227, + "balance_loss_clip": 1.18201566, + "balance_loss_mlp": 1.13104558, + "epoch": 0.12589809108672778, + "flos": 22604346885600.0, + "grad_norm": 1.8298410348884913, + "language_loss": 0.76434219, + "learning_rate": 3.904303360507276e-06, + "loss": 0.79348958, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.8357794284820557 + }, + { + "auxiliary_loss_clip": 0.01542773, + "auxiliary_loss_mlp": 0.01361761, + "balance_loss_clip": 1.18012738, + "balance_loss_mlp": 1.12048185, + "epoch": 0.12595821433939577, + "flos": 45226330729920.0, + "grad_norm": 1.9077762535184337, + "language_loss": 0.77567154, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.80471694, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 3.0143117904663086 + }, + { + "auxiliary_loss_clip": 0.0154864, + "auxiliary_loss_mlp": 0.01388264, + "balance_loss_clip": 1.18814588, + "balance_loss_mlp": 1.14450514, + "epoch": 0.12601833759206374, + "flos": 14321588460480.0, + "grad_norm": 2.924798976106808, + "language_loss": 0.83113265, + "learning_rate": 3.904065156953232e-06, + "loss": 0.86050171, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 2.80454421043396 + }, + { + "auxiliary_loss_clip": 0.01554988, + "auxiliary_loss_mlp": 0.01379953, + "balance_loss_clip": 1.19312882, + "balance_loss_mlp": 1.13562131, + "epoch": 0.1260784608447317, + "flos": 21290388523200.0, + "grad_norm": 2.573987322085914, + "language_loss": 0.76279426, + "learning_rate": 3.903945946870439e-06, + "loss": 0.79214358, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.808429479598999 + }, + { + "auxiliary_loss_clip": 0.01540757, + "auxiliary_loss_mlp": 0.0134768, + "balance_loss_clip": 1.17874432, + "balance_loss_mlp": 1.10392118, + "epoch": 0.12613858409739967, + "flos": 26253792944640.0, + "grad_norm": 2.830448850726964, + "language_loss": 0.87465096, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.90353525, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 2.8892836570739746 + }, + { + "auxiliary_loss_clip": 0.0153722, + "auxiliary_loss_mlp": 0.01365189, + "balance_loss_clip": 1.17528081, + "balance_loss_mlp": 1.11627996, + "epoch": 0.12619870735006763, + "flos": 21582134844480.0, + "grad_norm": 2.3407019898630606, + "language_loss": 0.7006498, + "learning_rate": 3.903707310115912e-06, + "loss": 0.72967392, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.8228607177734375 + }, + { + "auxiliary_loss_clip": 0.01546423, + "auxiliary_loss_mlp": 0.01341493, + "balance_loss_clip": 1.1851027, + "balance_loss_mlp": 1.09925997, + "epoch": 0.1262588306027356, + "flos": 23369251739040.0, + "grad_norm": 1.898878107341776, + "language_loss": 0.81498486, + "learning_rate": 3.903587883453228e-06, + "loss": 0.84386402, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 2.9070041179656982 + }, + { + "auxiliary_loss_clip": 0.01543759, + "auxiliary_loss_mlp": 0.01345699, + "balance_loss_clip": 1.1817975, + "balance_loss_mlp": 1.09106755, + "epoch": 0.12631895385540357, + "flos": 23951341039680.0, + "grad_norm": 2.0944322782469937, + "language_loss": 0.80695581, + "learning_rate": 3.903468384606302e-06, + "loss": 0.83585036, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.8570332527160645 + }, + { + "auxiliary_loss_clip": 0.01610021, + "auxiliary_loss_mlp": 0.01360397, + "balance_loss_clip": 1.24812675, + "balance_loss_mlp": 1.17042542, + "epoch": 0.12637907710807156, + "flos": 70288712290080.0, + "grad_norm": 0.7745936671520807, + "language_loss": 0.57005054, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59975469, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 3.368837356567383 + }, + { + "auxiliary_loss_clip": 0.0154141, + "auxiliary_loss_mlp": 0.01314334, + "balance_loss_clip": 1.18049955, + "balance_loss_mlp": 1.05283618, + "epoch": 0.12643920036073952, + "flos": 18917010230400.0, + "grad_norm": 3.145137202336918, + "language_loss": 0.94182813, + "learning_rate": 3.903229170377845e-06, + "loss": 0.97038555, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 2.85129451751709 + }, + { + "auxiliary_loss_clip": 0.01537967, + "auxiliary_loss_mlp": 0.01338697, + "balance_loss_clip": 1.17622197, + "balance_loss_mlp": 1.07300353, + "epoch": 0.1264993236134075, + "flos": 27784968065280.0, + "grad_norm": 1.852565206726916, + "language_loss": 0.7817415, + "learning_rate": 3.903109455005387e-06, + "loss": 0.81050813, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 2.867084503173828 + }, + { + "auxiliary_loss_clip": 0.01543492, + "auxiliary_loss_mlp": 0.0138137, + "balance_loss_clip": 1.18275058, + "balance_loss_mlp": 1.09984493, + "epoch": 0.12655944686607545, + "flos": 24757056957600.0, + "grad_norm": 2.8971357955812254, + "language_loss": 0.81368351, + "learning_rate": 3.902989667466828e-06, + "loss": 0.84293211, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 4.413256406784058 + }, + { + "auxiliary_loss_clip": 0.01542176, + "auxiliary_loss_mlp": 0.01410183, + "balance_loss_clip": 1.18110216, + "balance_loss_mlp": 1.1328547, + "epoch": 0.12661957011874342, + "flos": 24135294437280.0, + "grad_norm": 2.089282851564577, + "language_loss": 0.83775532, + "learning_rate": 3.90286980776671e-06, + "loss": 0.86727887, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.826700448989868 + }, + { + "auxiliary_loss_clip": 0.01552772, + "auxiliary_loss_mlp": 0.01361018, + "balance_loss_clip": 1.19141912, + "balance_loss_mlp": 1.0890305, + "epoch": 0.12667969337141138, + "flos": 24571814002560.0, + "grad_norm": 1.8537985761678204, + "language_loss": 0.73647112, + "learning_rate": 3.902749875909578e-06, + "loss": 0.76560903, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.8477118015289307 + }, + { + "auxiliary_loss_clip": 0.01549917, + "auxiliary_loss_mlp": 0.01323765, + "balance_loss_clip": 1.18942642, + "balance_loss_mlp": 1.05635428, + "epoch": 0.12673981662407935, + "flos": 22963643988480.0, + "grad_norm": 2.0730998474292277, + "language_loss": 0.7951476, + "learning_rate": 3.90262987189998e-06, + "loss": 0.82388443, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.8757591247558594 + }, + { + "auxiliary_loss_clip": 0.01540959, + "auxiliary_loss_mlp": 0.01328929, + "balance_loss_clip": 1.17958724, + "balance_loss_mlp": 1.06438017, + "epoch": 0.12679993987674734, + "flos": 17276790556800.0, + "grad_norm": 1.8905781892317133, + "language_loss": 0.76172781, + "learning_rate": 3.902509795742467e-06, + "loss": 0.79042667, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.807241439819336 + }, + { + "auxiliary_loss_clip": 0.01548945, + "auxiliary_loss_mlp": 0.01355595, + "balance_loss_clip": 1.18789232, + "balance_loss_mlp": 1.09199905, + "epoch": 0.1268600631294153, + "flos": 17277018125760.0, + "grad_norm": 1.9449152717957483, + "language_loss": 0.82976252, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85880792, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.9439942836761475 + }, + { + "auxiliary_loss_clip": 0.01545522, + "auxiliary_loss_mlp": 0.01353028, + "balance_loss_clip": 1.18462694, + "balance_loss_mlp": 1.0858084, + "epoch": 0.12692018638208327, + "flos": 24063343989120.0, + "grad_norm": 1.6364640103974581, + "language_loss": 0.78447771, + "learning_rate": 3.90226942700191e-06, + "loss": 0.81346309, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.8503592014312744 + }, + { + "auxiliary_loss_clip": 0.01547632, + "auxiliary_loss_mlp": 0.01332118, + "balance_loss_clip": 1.18809295, + "balance_loss_mlp": 1.0629909, + "epoch": 0.12698030963475124, + "flos": 31834598148000.0, + "grad_norm": 2.309999651556679, + "language_loss": 0.77300465, + "learning_rate": 3.902149134427982e-06, + "loss": 0.80180216, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.840278148651123 + }, + { + "auxiliary_loss_clip": 0.01541364, + "auxiliary_loss_mlp": 0.01308126, + "balance_loss_clip": 1.18094921, + "balance_loss_mlp": 1.04414868, + "epoch": 0.1270404328874192, + "flos": 25189859563200.0, + "grad_norm": 2.1020309416967793, + "language_loss": 0.85743093, + "learning_rate": 3.902028769724367e-06, + "loss": 0.88592589, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 5.762375593185425 + }, + { + "auxiliary_loss_clip": 0.0154288, + "auxiliary_loss_mlp": 0.0131697, + "balance_loss_clip": 1.18389225, + "balance_loss_mlp": 1.05223024, + "epoch": 0.12710055614008717, + "flos": 15999167736000.0, + "grad_norm": 6.591600947196446, + "language_loss": 0.74279237, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.77139086, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 4.2879509925842285 + }, + { + "auxiliary_loss_clip": 0.01546692, + "auxiliary_loss_mlp": 0.01313332, + "balance_loss_clip": 1.18826628, + "balance_loss_mlp": 1.04878294, + "epoch": 0.12716067939275516, + "flos": 15087441517920.0, + "grad_norm": 1.889131442955192, + "language_loss": 0.83525813, + "learning_rate": 3.901787823946341e-06, + "loss": 0.86385834, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 2.7684526443481445 + }, + { + "auxiliary_loss_clip": 0.01545451, + "auxiliary_loss_mlp": 0.01305508, + "balance_loss_clip": 1.18754768, + "balance_loss_mlp": 1.03561854, + "epoch": 0.12722080264542313, + "flos": 28369295127360.0, + "grad_norm": 1.846425883339568, + "language_loss": 0.87016022, + "learning_rate": 3.901667242881065e-06, + "loss": 0.89866978, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.8375844955444336 + }, + { + "auxiliary_loss_clip": 0.01547076, + "auxiliary_loss_mlp": 0.01323924, + "balance_loss_clip": 1.18959641, + "balance_loss_mlp": 1.06814885, + "epoch": 0.1272809258980911, + "flos": 32382665524800.0, + "grad_norm": 2.193225783322501, + "language_loss": 0.70755434, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.73626435, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.9061009883880615 + }, + { + "auxiliary_loss_clip": 0.01550647, + "auxiliary_loss_mlp": 0.01311459, + "balance_loss_clip": 1.19310749, + "balance_loss_mlp": 1.05320442, + "epoch": 0.12734104915075906, + "flos": 16036261840800.0, + "grad_norm": 2.4233494411088543, + "language_loss": 0.8706249, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89924598, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 2.8045132160186768 + }, + { + "auxiliary_loss_clip": 0.0155049, + "auxiliary_loss_mlp": 0.01319002, + "balance_loss_clip": 1.19338322, + "balance_loss_mlp": 1.06513429, + "epoch": 0.12740117240342702, + "flos": 18262325702880.0, + "grad_norm": 2.2166510687049583, + "language_loss": 0.87875867, + "learning_rate": 3.901305067035068e-06, + "loss": 0.90745354, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 2.8076605796813965 + }, + { + "auxiliary_loss_clip": 0.01542743, + "auxiliary_loss_mlp": 0.01315313, + "balance_loss_clip": 1.18523228, + "balance_loss_mlp": 1.05782104, + "epoch": 0.127461295656095, + "flos": 12122947022400.0, + "grad_norm": 2.3194861068541597, + "language_loss": 0.88268363, + "learning_rate": 3.901184197551605e-06, + "loss": 0.91126418, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 2.984961748123169 + }, + { + "auxiliary_loss_clip": 0.01545675, + "auxiliary_loss_mlp": 0.01312225, + "balance_loss_clip": 1.18741655, + "balance_loss_mlp": 1.06198084, + "epoch": 0.12752141890876295, + "flos": 23151352273920.0, + "grad_norm": 2.087820899924513, + "language_loss": 0.75835025, + "learning_rate": 3.901063255975046e-06, + "loss": 0.78692919, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 2.858482599258423 + }, + { + "auxiliary_loss_clip": 0.01546776, + "auxiliary_loss_mlp": 0.01332136, + "balance_loss_clip": 1.1902113, + "balance_loss_mlp": 1.085325, + "epoch": 0.12758154216143094, + "flos": 21618129032640.0, + "grad_norm": 2.429405511587025, + "language_loss": 0.8286767, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85746574, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.807121515274048 + }, + { + "auxiliary_loss_clip": 0.01558152, + "auxiliary_loss_mlp": 0.01330019, + "balance_loss_clip": 1.20200038, + "balance_loss_mlp": 1.08263624, + "epoch": 0.1276416654140989, + "flos": 15926724221760.0, + "grad_norm": 2.030964115831625, + "language_loss": 0.78787148, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81675327, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.7809817790985107 + }, + { + "auxiliary_loss_clip": 0.01556464, + "auxiliary_loss_mlp": 0.01342641, + "balance_loss_clip": 1.20043921, + "balance_loss_mlp": 1.09621108, + "epoch": 0.12770178866676687, + "flos": 22384892365920.0, + "grad_norm": 1.6469734660223623, + "language_loss": 0.79593623, + "learning_rate": 3.900699998732673e-06, + "loss": 0.82492721, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.889827251434326 + }, + { + "auxiliary_loss_clip": 0.01556489, + "auxiliary_loss_mlp": 0.01335116, + "balance_loss_clip": 1.20103478, + "balance_loss_mlp": 1.09078431, + "epoch": 0.12776191191943484, + "flos": 21654616286880.0, + "grad_norm": 2.170618072352186, + "language_loss": 0.75811994, + "learning_rate": 3.900578768829623e-06, + "loss": 0.78703594, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 2.867892026901245 + }, + { + "auxiliary_loss_clip": 0.01567107, + "auxiliary_loss_mlp": 0.01355381, + "balance_loss_clip": 1.21185207, + "balance_loss_mlp": 1.11734366, + "epoch": 0.1278220351721028, + "flos": 25737737299200.0, + "grad_norm": 2.5828123933764298, + "language_loss": 0.78239834, + "learning_rate": 3.900457466856434e-06, + "loss": 0.81162322, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.838941812515259 + }, + { + "auxiliary_loss_clip": 0.01568286, + "auxiliary_loss_mlp": 0.01347822, + "balance_loss_clip": 1.21452928, + "balance_loss_mlp": 1.10692358, + "epoch": 0.12788215842477077, + "flos": 41246337477600.0, + "grad_norm": 1.4751784379988835, + "language_loss": 0.69112957, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7202906, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 3.0028162002563477 + }, + { + "auxiliary_loss_clip": 0.01638141, + "auxiliary_loss_mlp": 0.01214355, + "balance_loss_clip": 1.28545856, + "balance_loss_mlp": 1.01446533, + "epoch": 0.12794228167743876, + "flos": 70884683297280.0, + "grad_norm": 0.8483815502545479, + "language_loss": 0.62708157, + "learning_rate": 3.900214646718047e-06, + "loss": 0.65560651, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.363774538040161 + }, + { + "auxiliary_loss_clip": 0.01563939, + "auxiliary_loss_mlp": 0.01326884, + "balance_loss_clip": 1.20940006, + "balance_loss_mlp": 1.07110822, + "epoch": 0.12800240493010673, + "flos": 16291596764160.0, + "grad_norm": 3.8359666792383176, + "language_loss": 0.77683592, + "learning_rate": 3.900093128562056e-06, + "loss": 0.80574417, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.887253761291504 + }, + { + "auxiliary_loss_clip": 0.01569599, + "auxiliary_loss_mlp": 0.01344813, + "balance_loss_clip": 1.21562803, + "balance_loss_mlp": 1.08465099, + "epoch": 0.1280625281827747, + "flos": 20633655875040.0, + "grad_norm": 2.0449276152198532, + "language_loss": 0.79827929, + "learning_rate": 3.899971538354343e-06, + "loss": 0.82742333, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.7956738471984863 + }, + { + "auxiliary_loss_clip": 0.01575068, + "auxiliary_loss_mlp": 0.01329598, + "balance_loss_clip": 1.22206366, + "balance_loss_mlp": 1.07515788, + "epoch": 0.12812265143544266, + "flos": 22640454858240.0, + "grad_norm": 1.9762846100589846, + "language_loss": 0.71125782, + "learning_rate": 3.899849876099518e-06, + "loss": 0.74030447, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.838303327560425 + }, + { + "auxiliary_loss_clip": 0.01571066, + "auxiliary_loss_mlp": 0.01346816, + "balance_loss_clip": 1.21852303, + "balance_loss_mlp": 1.09218466, + "epoch": 0.12818277468811062, + "flos": 34717584299040.0, + "grad_norm": 2.1008739615214327, + "language_loss": 0.72349745, + "learning_rate": 3.899728141802197e-06, + "loss": 0.75267625, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.8877365589141846 + }, + { + "auxiliary_loss_clip": 0.01575476, + "auxiliary_loss_mlp": 0.01317509, + "balance_loss_clip": 1.22359157, + "balance_loss_mlp": 1.0668838, + "epoch": 0.1282428979407786, + "flos": 23114334025440.0, + "grad_norm": 2.882961510008664, + "language_loss": 0.82269061, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.85162044, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.732194423675537 + }, + { + "auxiliary_loss_clip": 0.01567185, + "auxiliary_loss_mlp": 0.01333264, + "balance_loss_clip": 1.21432078, + "balance_loss_mlp": 1.07729745, + "epoch": 0.12830302119344655, + "flos": 20888611516800.0, + "grad_norm": 2.90064102501008, + "language_loss": 0.79454654, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82355106, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 2.7831060886383057 + }, + { + "auxiliary_loss_clip": 0.01570506, + "auxiliary_loss_mlp": 0.01322796, + "balance_loss_clip": 1.21762204, + "balance_loss_mlp": 1.06721115, + "epoch": 0.12836314444611455, + "flos": 21399926142240.0, + "grad_norm": 2.0274853225785345, + "language_loss": 0.83260441, + "learning_rate": 3.899362506701421e-06, + "loss": 0.86153746, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.77888560295105 + }, + { + "auxiliary_loss_clip": 0.01579367, + "auxiliary_loss_mlp": 0.01340653, + "balance_loss_clip": 1.22787237, + "balance_loss_mlp": 1.09098113, + "epoch": 0.1284232676987825, + "flos": 13664173105440.0, + "grad_norm": 2.569391225152637, + "language_loss": 0.78320581, + "learning_rate": 3.899240484280298e-06, + "loss": 0.81240594, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 2.786963701248169 + }, + { + "auxiliary_loss_clip": 0.01663428, + "auxiliary_loss_mlp": 0.01271904, + "balance_loss_clip": 1.31297028, + "balance_loss_mlp": 1.06285858, + "epoch": 0.12848339095145048, + "flos": 60000823720800.0, + "grad_norm": 0.9304367172809301, + "language_loss": 0.59224415, + "learning_rate": 3.899118389839785e-06, + "loss": 0.62159753, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.4479076862335205 + }, + { + "auxiliary_loss_clip": 0.01573376, + "auxiliary_loss_mlp": 0.01316726, + "balance_loss_clip": 1.22153747, + "balance_loss_mlp": 1.06552815, + "epoch": 0.12854351420411844, + "flos": 13883096630880.0, + "grad_norm": 4.886891226687069, + "language_loss": 0.82772082, + "learning_rate": 3.898996223384512e-06, + "loss": 0.85662186, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 2.742478132247925 + }, + { + "auxiliary_loss_clip": 0.01577573, + "auxiliary_loss_mlp": 0.01332922, + "balance_loss_clip": 1.22551858, + "balance_loss_mlp": 1.0805794, + "epoch": 0.1286036374567864, + "flos": 22640151432960.0, + "grad_norm": 2.770473265590195, + "language_loss": 0.79291135, + "learning_rate": 3.898873984919113e-06, + "loss": 0.82201636, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 2.9668333530426025 + }, + { + "auxiliary_loss_clip": 0.01578473, + "auxiliary_loss_mlp": 0.0134031, + "balance_loss_clip": 1.22642183, + "balance_loss_mlp": 1.08739591, + "epoch": 0.12866376070945437, + "flos": 16326566892000.0, + "grad_norm": 1.7980218061766655, + "language_loss": 0.85202408, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.88121188, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 2.7731575965881348 + }, + { + "auxiliary_loss_clip": 0.01581451, + "auxiliary_loss_mlp": 0.01367237, + "balance_loss_clip": 1.22844136, + "balance_loss_mlp": 1.12881875, + "epoch": 0.12872388396212234, + "flos": 11876145935040.0, + "grad_norm": 2.105773422335695, + "language_loss": 0.86001313, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88950002, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 2.7665979862213135 + }, + { + "auxiliary_loss_clip": 0.01573252, + "auxiliary_loss_mlp": 0.01348888, + "balance_loss_clip": 1.22037625, + "balance_loss_mlp": 1.10017014, + "epoch": 0.12878400721479033, + "flos": 28369257199200.0, + "grad_norm": 2.326331541862848, + "language_loss": 0.68219656, + "learning_rate": 3.898506837508518e-06, + "loss": 0.71141791, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.8571689128875732 + }, + { + "auxiliary_loss_clip": 0.01580651, + "auxiliary_loss_mlp": 0.01365194, + "balance_loss_clip": 1.22861743, + "balance_loss_mlp": 1.12219775, + "epoch": 0.1288441304674583, + "flos": 25888048054560.0, + "grad_norm": 2.2503780826448705, + "language_loss": 0.83334351, + "learning_rate": 3.89838431104899e-06, + "loss": 0.86280203, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 4.565563440322876 + }, + { + "auxiliary_loss_clip": 0.0157736, + "auxiliary_loss_mlp": 0.01363251, + "balance_loss_clip": 1.22415435, + "balance_loss_mlp": 1.11720276, + "epoch": 0.12890425372012626, + "flos": 20815902505440.0, + "grad_norm": 1.8583530632979266, + "language_loss": 0.81952161, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84892774, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.9203226566314697 + }, + { + "auxiliary_loss_clip": 0.0157304, + "auxiliary_loss_mlp": 0.01317858, + "balance_loss_clip": 1.21946526, + "balance_loss_mlp": 1.06837702, + "epoch": 0.12896437697279423, + "flos": 22568125128480.0, + "grad_norm": 15.071087672924747, + "language_loss": 0.78484696, + "learning_rate": 3.898139042173813e-06, + "loss": 0.81375587, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.839275360107422 + }, + { + "auxiliary_loss_clip": 0.01581562, + "auxiliary_loss_mlp": 0.01327658, + "balance_loss_clip": 1.22652185, + "balance_loss_mlp": 1.0781765, + "epoch": 0.1290245002254622, + "flos": 17495714082240.0, + "grad_norm": 2.3647825925019084, + "language_loss": 0.83006799, + "learning_rate": 3.898016299767465e-06, + "loss": 0.85916018, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.822547197341919 + }, + { + "auxiliary_loss_clip": 0.0157926, + "auxiliary_loss_mlp": 0.01335718, + "balance_loss_clip": 1.22544813, + "balance_loss_mlp": 1.0852828, + "epoch": 0.12908462347813016, + "flos": 36318699675360.0, + "grad_norm": 2.2090418246401815, + "language_loss": 0.71080661, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73995638, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.9891459941864014 + }, + { + "auxiliary_loss_clip": 0.01572902, + "auxiliary_loss_mlp": 0.01326141, + "balance_loss_clip": 1.21768451, + "balance_loss_mlp": 1.06979322, + "epoch": 0.12914474673079815, + "flos": 22530917239200.0, + "grad_norm": 3.676890721101346, + "language_loss": 0.71469998, + "learning_rate": 3.897770599040521e-06, + "loss": 0.74369043, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.8157172203063965 + }, + { + "auxiliary_loss_clip": 0.01580236, + "auxiliary_loss_mlp": 0.01336636, + "balance_loss_clip": 1.22614336, + "balance_loss_mlp": 1.0850563, + "epoch": 0.12920486998346611, + "flos": 21474114351840.0, + "grad_norm": 7.032219556164139, + "language_loss": 0.79154497, + "learning_rate": 3.897647640729242e-06, + "loss": 0.82071376, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.8474512100219727 + }, + { + "auxiliary_loss_clip": 0.01575314, + "auxiliary_loss_mlp": 0.01329495, + "balance_loss_clip": 1.21982527, + "balance_loss_mlp": 1.07658076, + "epoch": 0.12926499323613408, + "flos": 27311050969920.0, + "grad_norm": 2.1208376776218376, + "language_loss": 0.76281291, + "learning_rate": 3.897524610458975e-06, + "loss": 0.79186106, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.892439603805542 + }, + { + "auxiliary_loss_clip": 0.01579742, + "auxiliary_loss_mlp": 0.01333323, + "balance_loss_clip": 1.22302198, + "balance_loss_mlp": 1.0796454, + "epoch": 0.12932511648880204, + "flos": 22093335685440.0, + "grad_norm": 2.322197395428139, + "language_loss": 0.70672911, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.73585975, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 5.849905729293823 + }, + { + "auxiliary_loss_clip": 0.01572913, + "auxiliary_loss_mlp": 0.01332142, + "balance_loss_clip": 1.21634007, + "balance_loss_mlp": 1.08475888, + "epoch": 0.12938523974147, + "flos": 20304853377120.0, + "grad_norm": 1.9565707317392027, + "language_loss": 0.84663951, + "learning_rate": 3.897278334060137e-06, + "loss": 0.87569004, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 4.2793967723846436 + }, + { + "auxiliary_loss_clip": 0.01578856, + "auxiliary_loss_mlp": 0.01335043, + "balance_loss_clip": 1.22146666, + "balance_loss_mlp": 1.08289111, + "epoch": 0.12944536299413797, + "flos": 19501982071200.0, + "grad_norm": 1.7926261154549834, + "language_loss": 0.78979129, + "learning_rate": 3.897155087940906e-06, + "loss": 0.81893033, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 2.801891326904297 + }, + { + "auxiliary_loss_clip": 0.01568041, + "auxiliary_loss_mlp": 0.01322542, + "balance_loss_clip": 1.21139646, + "balance_loss_mlp": 1.07172561, + "epoch": 0.12950548624680594, + "flos": 27710324717760.0, + "grad_norm": 1.974673731081781, + "language_loss": 0.80035871, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82926452, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 2.8142855167388916 + }, + { + "auxiliary_loss_clip": 0.01566287, + "auxiliary_loss_mlp": 0.0132042, + "balance_loss_clip": 1.20960069, + "balance_loss_mlp": 1.06922174, + "epoch": 0.12956560949947393, + "flos": 17567626602240.0, + "grad_norm": 3.2023662507788315, + "language_loss": 0.83679986, + "learning_rate": 3.896908379886188e-06, + "loss": 0.86566699, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 2.7726266384124756 + }, + { + "auxiliary_loss_clip": 0.01566573, + "auxiliary_loss_mlp": 0.01321035, + "balance_loss_clip": 1.21035051, + "balance_loss_mlp": 1.06678545, + "epoch": 0.1296257327521419, + "flos": 20742814212480.0, + "grad_norm": 3.424262026376062, + "language_loss": 0.7548449, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78372097, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 2.916891574859619 + }, + { + "auxiliary_loss_clip": 0.01572709, + "auxiliary_loss_mlp": 0.01321536, + "balance_loss_clip": 1.21668506, + "balance_loss_mlp": 1.07434404, + "epoch": 0.12968585600480986, + "flos": 16397417423520.0, + "grad_norm": 1.8165814044064665, + "language_loss": 0.86781871, + "learning_rate": 3.896661384107648e-06, + "loss": 0.89676118, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 2.7104883193969727 + }, + { + "auxiliary_loss_clip": 0.01563425, + "auxiliary_loss_mlp": 0.01319378, + "balance_loss_clip": 1.2071768, + "balance_loss_mlp": 1.06760812, + "epoch": 0.12974597925747783, + "flos": 28332049309920.0, + "grad_norm": 3.0571393886626463, + "language_loss": 0.81563473, + "learning_rate": 3.896537778333651e-06, + "loss": 0.84446287, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 2.8732190132141113 + }, + { + "auxiliary_loss_clip": 0.0156797, + "auxiliary_loss_mlp": 0.01322607, + "balance_loss_clip": 1.21049428, + "balance_loss_mlp": 1.08514214, + "epoch": 0.1298061025101458, + "flos": 9684369493920.0, + "grad_norm": 2.4869434817513807, + "language_loss": 0.74755585, + "learning_rate": 3.896414100642752e-06, + "loss": 0.77646166, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 2.7272675037384033 + }, + { + "auxiliary_loss_clip": 0.01564503, + "auxiliary_loss_mlp": 0.01340931, + "balance_loss_clip": 1.20777082, + "balance_loss_mlp": 1.09526479, + "epoch": 0.12986622576281376, + "flos": 27712031484960.0, + "grad_norm": 2.3819343092330394, + "language_loss": 0.82802308, + "learning_rate": 3.89629035103964e-06, + "loss": 0.85707748, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.8619163036346436 + }, + { + "auxiliary_loss_clip": 0.0157054, + "auxiliary_loss_mlp": 0.0133266, + "balance_loss_clip": 1.21452653, + "balance_loss_mlp": 1.08813822, + "epoch": 0.12992634901548175, + "flos": 18804476286720.0, + "grad_norm": 1.7895262055119523, + "language_loss": 0.82126194, + "learning_rate": 3.896166529529008e-06, + "loss": 0.85029399, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 2.7309327125549316 + }, + { + "auxiliary_loss_clip": 0.01568653, + "auxiliary_loss_mlp": 0.01312943, + "balance_loss_clip": 1.21135235, + "balance_loss_mlp": 1.06498718, + "epoch": 0.12998647226814972, + "flos": 29129876170560.0, + "grad_norm": 2.141398364728214, + "language_loss": 0.83073425, + "learning_rate": 3.896042636115551e-06, + "loss": 0.85955024, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.755164623260498 + }, + { + "auxiliary_loss_clip": 0.01566843, + "auxiliary_loss_mlp": 0.01344217, + "balance_loss_clip": 1.21001649, + "balance_loss_mlp": 1.09721518, + "epoch": 0.13004659552081768, + "flos": 19575753071040.0, + "grad_norm": 3.60335004277742, + "language_loss": 0.72982264, + "learning_rate": 3.895918670803968e-06, + "loss": 0.75893319, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.778907060623169 + }, + { + "auxiliary_loss_clip": 0.01570205, + "auxiliary_loss_mlp": 0.01341068, + "balance_loss_clip": 1.21279478, + "balance_loss_mlp": 1.08319426, + "epoch": 0.13010671877348565, + "flos": 22492837002240.0, + "grad_norm": 2.773013495957855, + "language_loss": 0.81817347, + "learning_rate": 3.895794633598958e-06, + "loss": 0.84728622, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 2.7892448902130127 + }, + { + "auxiliary_loss_clip": 0.01575227, + "auxiliary_loss_mlp": 0.01358945, + "balance_loss_clip": 1.21792591, + "balance_loss_mlp": 1.11175251, + "epoch": 0.1301668420261536, + "flos": 23880490508160.0, + "grad_norm": 2.4657876504418734, + "language_loss": 0.7232424, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.75258416, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 2.7934389114379883 + }, + { + "auxiliary_loss_clip": 0.01562924, + "auxiliary_loss_mlp": 0.0133984, + "balance_loss_clip": 1.20589995, + "balance_loss_mlp": 1.08234763, + "epoch": 0.13022696527882158, + "flos": 23152376334240.0, + "grad_norm": 2.016211522754961, + "language_loss": 0.75170052, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.78072822, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 2.7903223037719727 + }, + { + "auxiliary_loss_clip": 0.01572178, + "auxiliary_loss_mlp": 0.01333981, + "balance_loss_clip": 1.21462297, + "balance_loss_mlp": 1.08297431, + "epoch": 0.13028708853148954, + "flos": 26911094515200.0, + "grad_norm": 1.6017747737544077, + "language_loss": 0.83506554, + "learning_rate": 3.895422090670421e-06, + "loss": 0.86412716, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.897900342941284 + }, + { + "auxiliary_loss_clip": 0.01573953, + "auxiliary_loss_mlp": 0.0133185, + "balance_loss_clip": 1.21478319, + "balance_loss_mlp": 1.08236849, + "epoch": 0.13034721178415754, + "flos": 21253597843680.0, + "grad_norm": 1.6324281596386916, + "language_loss": 0.83648783, + "learning_rate": 3.89529776593877e-06, + "loss": 0.86554587, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.748018980026245 + }, + { + "auxiliary_loss_clip": 0.01573525, + "auxiliary_loss_mlp": 0.01348327, + "balance_loss_clip": 1.21400547, + "balance_loss_mlp": 1.0975101, + "epoch": 0.1304073350368255, + "flos": 18769051020960.0, + "grad_norm": 2.954568277599365, + "language_loss": 0.80335331, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.83257174, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.8068747520446777 + }, + { + "auxiliary_loss_clip": 0.01571611, + "auxiliary_loss_mlp": 0.0134223, + "balance_loss_clip": 1.21417224, + "balance_loss_mlp": 1.08416557, + "epoch": 0.13046745828949347, + "flos": 28367095294080.0, + "grad_norm": 2.6243643655774296, + "language_loss": 0.67128491, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.70042336, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.817949056625366 + }, + { + "auxiliary_loss_clip": 0.01578754, + "auxiliary_loss_mlp": 0.0136256, + "balance_loss_clip": 1.21839392, + "balance_loss_mlp": 1.11040902, + "epoch": 0.13052758154216143, + "flos": 29607548153760.0, + "grad_norm": 1.844425894327979, + "language_loss": 0.67102426, + "learning_rate": 3.8949243605434e-06, + "loss": 0.70043737, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.855860471725464 + }, + { + "auxiliary_loss_clip": 0.01573817, + "auxiliary_loss_mlp": 0.01353412, + "balance_loss_clip": 1.21484125, + "balance_loss_mlp": 1.10412121, + "epoch": 0.1305877047948294, + "flos": 19392899590080.0, + "grad_norm": 2.2634334107969614, + "language_loss": 0.72673291, + "learning_rate": 3.894799748360537e-06, + "loss": 0.75600517, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 2.794888734817505 + }, + { + "auxiliary_loss_clip": 0.01577611, + "auxiliary_loss_mlp": 0.01340313, + "balance_loss_clip": 1.21768868, + "balance_loss_mlp": 1.09407449, + "epoch": 0.13064782804749736, + "flos": 16875430760160.0, + "grad_norm": 2.1672464325358405, + "language_loss": 0.75642055, + "learning_rate": 3.894675064326678e-06, + "loss": 0.78559977, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.755970001220703 + }, + { + "auxiliary_loss_clip": 0.01572991, + "auxiliary_loss_mlp": 0.013379, + "balance_loss_clip": 1.21301603, + "balance_loss_mlp": 1.09013534, + "epoch": 0.13070795130016533, + "flos": 24501418608960.0, + "grad_norm": 3.282261349204089, + "language_loss": 0.71181226, + "learning_rate": 3.894550308446551e-06, + "loss": 0.74092114, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 2.904142379760742 + }, + { + "auxiliary_loss_clip": 0.01660188, + "auxiliary_loss_mlp": 0.01520447, + "balance_loss_clip": 1.30099154, + "balance_loss_mlp": 1.33772278, + "epoch": 0.13076807455283332, + "flos": 71061468272640.0, + "grad_norm": 0.9306895152795055, + "language_loss": 0.58973479, + "learning_rate": 3.894425480724886e-06, + "loss": 0.62154114, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.468292474746704 + }, + { + "auxiliary_loss_clip": 0.01571246, + "auxiliary_loss_mlp": 0.01328424, + "balance_loss_clip": 1.21264863, + "balance_loss_mlp": 1.07608199, + "epoch": 0.13082819780550128, + "flos": 20266469714880.0, + "grad_norm": 2.281545978679262, + "language_loss": 0.80378461, + "learning_rate": 3.894300581166417e-06, + "loss": 0.83278131, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 2.8149540424346924 + }, + { + "auxiliary_loss_clip": 0.01567213, + "auxiliary_loss_mlp": 0.01302332, + "balance_loss_clip": 1.20746994, + "balance_loss_mlp": 1.04865432, + "epoch": 0.13088832105816925, + "flos": 34206269673600.0, + "grad_norm": 4.687736361170115, + "language_loss": 0.74973059, + "learning_rate": 3.894175609775881e-06, + "loss": 0.77842605, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 2.858414649963379 + }, + { + "auxiliary_loss_clip": 0.01567767, + "auxiliary_loss_mlp": 0.01320695, + "balance_loss_clip": 1.20636868, + "balance_loss_mlp": 1.06434751, + "epoch": 0.13094844431083721, + "flos": 17896922166240.0, + "grad_norm": 2.031545563981263, + "language_loss": 0.8239733, + "learning_rate": 3.894050566558015e-06, + "loss": 0.85285795, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 2.775186061859131 + }, + { + "auxiliary_loss_clip": 0.01574322, + "auxiliary_loss_mlp": 0.01323328, + "balance_loss_clip": 1.21316242, + "balance_loss_mlp": 1.06278443, + "epoch": 0.13100856756350518, + "flos": 17313239882880.0, + "grad_norm": 3.0415135700121616, + "language_loss": 0.75122845, + "learning_rate": 3.893925451517562e-06, + "loss": 0.78020501, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 2.802546262741089 + }, + { + "auxiliary_loss_clip": 0.01565227, + "auxiliary_loss_mlp": 0.01320469, + "balance_loss_clip": 1.20490575, + "balance_loss_mlp": 1.05191398, + "epoch": 0.13106869081617314, + "flos": 22202759520000.0, + "grad_norm": 2.4635374171851576, + "language_loss": 0.84616148, + "learning_rate": 3.893800264659266e-06, + "loss": 0.87501842, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.7624709606170654 + }, + { + "auxiliary_loss_clip": 0.01572383, + "auxiliary_loss_mlp": 0.01336697, + "balance_loss_clip": 1.21199656, + "balance_loss_mlp": 1.07195747, + "epoch": 0.13112881406884114, + "flos": 21765329678880.0, + "grad_norm": 1.9418645849930636, + "language_loss": 0.90321481, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.93230557, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 4.342936992645264 + }, + { + "auxiliary_loss_clip": 0.01568697, + "auxiliary_loss_mlp": 0.01343373, + "balance_loss_clip": 1.20837283, + "balance_loss_mlp": 1.08054066, + "epoch": 0.1311889373215091, + "flos": 23333333407200.0, + "grad_norm": 1.937911975683557, + "language_loss": 0.68784541, + "learning_rate": 3.893549675508137e-06, + "loss": 0.71696615, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.826312780380249 + }, + { + "auxiliary_loss_clip": 0.01568257, + "auxiliary_loss_mlp": 0.01328056, + "balance_loss_clip": 1.2087841, + "balance_loss_mlp": 1.0701828, + "epoch": 0.13124906057417707, + "flos": 21469335403680.0, + "grad_norm": 1.9222666609337897, + "language_loss": 0.78880256, + "learning_rate": 3.893424273224806e-06, + "loss": 0.81776565, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.7535650730133057 + }, + { + "auxiliary_loss_clip": 0.01559338, + "auxiliary_loss_mlp": 0.01324101, + "balance_loss_clip": 1.19955146, + "balance_loss_mlp": 1.05955184, + "epoch": 0.13130918382684503, + "flos": 23257400502240.0, + "grad_norm": 2.638779432173238, + "language_loss": 0.86061609, + "learning_rate": 3.893298799142636e-06, + "loss": 0.88945049, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.83184552192688 + }, + { + "auxiliary_loss_clip": 0.01562772, + "auxiliary_loss_mlp": 0.01306418, + "balance_loss_clip": 1.20270872, + "balance_loss_mlp": 1.04511094, + "epoch": 0.131369307079513, + "flos": 20852389759680.0, + "grad_norm": 2.1380345706333643, + "language_loss": 0.82508075, + "learning_rate": 3.893173253266387e-06, + "loss": 0.85377258, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.7957651615142822 + }, + { + "auxiliary_loss_clip": 0.01558367, + "auxiliary_loss_mlp": 0.01313563, + "balance_loss_clip": 1.19741344, + "balance_loss_mlp": 1.05301952, + "epoch": 0.13142943033218096, + "flos": 17860283199360.0, + "grad_norm": 2.621915480443986, + "language_loss": 0.73256892, + "learning_rate": 3.893047635600818e-06, + "loss": 0.76128823, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.823218822479248 + }, + { + "auxiliary_loss_clip": 0.01562861, + "auxiliary_loss_mlp": 0.01301502, + "balance_loss_clip": 1.20227814, + "balance_loss_mlp": 1.04591727, + "epoch": 0.13148955358484893, + "flos": 20998149135840.0, + "grad_norm": 2.04679701047759, + "language_loss": 0.80335253, + "learning_rate": 3.892921946150693e-06, + "loss": 0.83199608, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.8426880836486816 + }, + { + "auxiliary_loss_clip": 0.01688069, + "auxiliary_loss_mlp": 0.01386795, + "balance_loss_clip": 1.32839978, + "balance_loss_mlp": 1.15867615, + "epoch": 0.13154967683751692, + "flos": 70179022958400.0, + "grad_norm": 0.9020603335430413, + "language_loss": 0.58919716, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61994576, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.3720521926879883 + }, + { + "auxiliary_loss_clip": 0.01570045, + "auxiliary_loss_mlp": 0.01339595, + "balance_loss_clip": 1.20983481, + "balance_loss_mlp": 1.09602702, + "epoch": 0.1316098000901849, + "flos": 20378245095360.0, + "grad_norm": 2.0553950000721515, + "language_loss": 0.74493927, + "learning_rate": 3.892670351915842e-06, + "loss": 0.77403569, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 4.323131084442139 + }, + { + "auxiliary_loss_clip": 0.0156885, + "auxiliary_loss_mlp": 0.01368906, + "balance_loss_clip": 1.20753849, + "balance_loss_mlp": 1.12552857, + "epoch": 0.13166992334285285, + "flos": 23223340650240.0, + "grad_norm": 2.9601421398554097, + "language_loss": 0.73106408, + "learning_rate": 3.892544447140657e-06, + "loss": 0.76044166, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 4.278883695602417 + }, + { + "auxiliary_loss_clip": 0.0156122, + "auxiliary_loss_mlp": 0.01379093, + "balance_loss_clip": 1.2005223, + "balance_loss_mlp": 1.13876748, + "epoch": 0.13173004659552082, + "flos": 23333181694560.0, + "grad_norm": 3.887879524935217, + "language_loss": 0.74995053, + "learning_rate": 3.892418470599996e-06, + "loss": 0.77935368, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 4.3492772579193115 + }, + { + "auxiliary_loss_clip": 0.01567888, + "auxiliary_loss_mlp": 0.01389927, + "balance_loss_clip": 1.2065165, + "balance_loss_mlp": 1.14578629, + "epoch": 0.13179016984818878, + "flos": 21253635771840.0, + "grad_norm": 3.5271714644719836, + "language_loss": 0.80098987, + "learning_rate": 3.892292422298637e-06, + "loss": 0.83056808, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.8244638442993164 + }, + { + "auxiliary_loss_clip": 0.0156025, + "auxiliary_loss_mlp": 0.01384168, + "balance_loss_clip": 1.19963574, + "balance_loss_mlp": 1.14517713, + "epoch": 0.13185029310085675, + "flos": 17780254053120.0, + "grad_norm": 1.9390348550722607, + "language_loss": 0.85811585, + "learning_rate": 3.892166302241361e-06, + "loss": 0.88756001, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 2.838779926300049 + }, + { + "auxiliary_loss_clip": 0.01680656, + "auxiliary_loss_mlp": 0.0130616, + "balance_loss_clip": 1.3204143, + "balance_loss_mlp": 1.10092926, + "epoch": 0.1319104163535247, + "flos": 69858754296480.0, + "grad_norm": 0.7770336119402064, + "language_loss": 0.54111916, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.57098734, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 3.3342525959014893 + }, + { + "auxiliary_loss_clip": 0.01563502, + "auxiliary_loss_mlp": 0.01385197, + "balance_loss_clip": 1.20258021, + "balance_loss_mlp": 1.14773262, + "epoch": 0.1319705396061927, + "flos": 25195510859040.0, + "grad_norm": 2.0425387124800474, + "language_loss": 0.732144, + "learning_rate": 3.891913846878185e-06, + "loss": 0.76163101, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 2.8248233795166016 + }, + { + "auxiliary_loss_clip": 0.0156143, + "auxiliary_loss_mlp": 0.01428539, + "balance_loss_clip": 1.20033038, + "balance_loss_mlp": 1.19698715, + "epoch": 0.13203066285886067, + "flos": 20742548715360.0, + "grad_norm": 1.9544515920791352, + "language_loss": 0.77918065, + "learning_rate": 3.891787511581859e-06, + "loss": 0.8090803, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 2.888869285583496 + }, + { + "auxiliary_loss_clip": 0.01556462, + "auxiliary_loss_mlp": 0.0148871, + "balance_loss_clip": 1.19540632, + "balance_loss_mlp": 1.26268888, + "epoch": 0.13209078611152864, + "flos": 22056582934080.0, + "grad_norm": 2.371752802703857, + "language_loss": 0.75098383, + "learning_rate": 3.89166110454876e-06, + "loss": 0.78143555, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 2.7687790393829346 + }, + { + "auxiliary_loss_clip": 0.0155242, + "auxiliary_loss_mlp": 0.01470228, + "balance_loss_clip": 1.19192004, + "balance_loss_mlp": 1.24001074, + "epoch": 0.1321509093641966, + "flos": 16286552318880.0, + "grad_norm": 1.952162276732442, + "language_loss": 0.799815, + "learning_rate": 3.891534625783685e-06, + "loss": 0.83004147, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 2.8069281578063965 + }, + { + "auxiliary_loss_clip": 0.01559765, + "auxiliary_loss_mlp": 0.01456572, + "balance_loss_clip": 1.19880319, + "balance_loss_mlp": 1.22788048, + "epoch": 0.13221103261686457, + "flos": 16984930451040.0, + "grad_norm": 5.454411320576574, + "language_loss": 0.82928556, + "learning_rate": 3.891408075291425e-06, + "loss": 0.85944891, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 2.8000454902648926 + }, + { + "auxiliary_loss_clip": 0.01561707, + "auxiliary_loss_mlp": 0.014854, + "balance_loss_clip": 1.20182526, + "balance_loss_mlp": 1.26033282, + "epoch": 0.13227115586953253, + "flos": 34236081571680.0, + "grad_norm": 1.6322863256563422, + "language_loss": 0.69774628, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.72821736, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.874117374420166 + }, + { + "auxiliary_loss_clip": 0.01556644, + "auxiliary_loss_mlp": 0.01440389, + "balance_loss_clip": 1.19688964, + "balance_loss_mlp": 1.20769262, + "epoch": 0.13233127912220052, + "flos": 20706782096160.0, + "grad_norm": 3.1268765614629275, + "language_loss": 0.85009086, + "learning_rate": 3.891154759144557e-06, + "loss": 0.88006109, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.8221311569213867 + }, + { + "auxiliary_loss_clip": 0.0155557, + "auxiliary_loss_mlp": 0.01424444, + "balance_loss_clip": 1.19524515, + "balance_loss_mlp": 1.17992198, + "epoch": 0.1323914023748685, + "flos": 25807032776160.0, + "grad_norm": 2.084547343582485, + "language_loss": 0.87084556, + "learning_rate": 3.891027993499554e-06, + "loss": 0.90064573, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 2.7850189208984375 + }, + { + "auxiliary_loss_clip": 0.0155281, + "auxiliary_loss_mlp": 0.01409731, + "balance_loss_clip": 1.19288468, + "balance_loss_mlp": 1.1680702, + "epoch": 0.13245152562753645, + "flos": 21253673700000.0, + "grad_norm": 2.3110283144776633, + "language_loss": 0.72452885, + "learning_rate": 3.89090115614658e-06, + "loss": 0.75415432, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.800266742706299 + }, + { + "auxiliary_loss_clip": 0.01551753, + "auxiliary_loss_mlp": 0.01414516, + "balance_loss_clip": 1.19271898, + "balance_loss_mlp": 1.17075682, + "epoch": 0.13251164888020442, + "flos": 26613165903840.0, + "grad_norm": 2.5974859738625122, + "language_loss": 0.74404716, + "learning_rate": 3.890774247090444e-06, + "loss": 0.77370989, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 2.844466209411621 + }, + { + "auxiliary_loss_clip": 0.01552313, + "auxiliary_loss_mlp": 0.01386101, + "balance_loss_clip": 1.19243348, + "balance_loss_mlp": 1.13242316, + "epoch": 0.13257177213287238, + "flos": 29829202506720.0, + "grad_norm": 2.0622685097844156, + "language_loss": 0.78633988, + "learning_rate": 3.89064726633596e-06, + "loss": 0.81572402, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.9454877376556396 + }, + { + "auxiliary_loss_clip": 0.0155229, + "auxiliary_loss_mlp": 0.01382071, + "balance_loss_clip": 1.19222498, + "balance_loss_mlp": 1.13526034, + "epoch": 0.13263189538554035, + "flos": 21290616092160.0, + "grad_norm": 2.4608443838216267, + "language_loss": 0.79224855, + "learning_rate": 3.890520213887941e-06, + "loss": 0.82159215, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.8443315029144287 + }, + { + "auxiliary_loss_clip": 0.01548621, + "auxiliary_loss_mlp": 0.01354071, + "balance_loss_clip": 1.18771875, + "balance_loss_mlp": 1.10325432, + "epoch": 0.13269201863820831, + "flos": 16876037610720.0, + "grad_norm": 4.803314221864926, + "language_loss": 0.74945498, + "learning_rate": 3.890393089751208e-06, + "loss": 0.77848184, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.7939438819885254 + }, + { + "auxiliary_loss_clip": 0.01546595, + "auxiliary_loss_mlp": 0.01334295, + "balance_loss_clip": 1.1857779, + "balance_loss_mlp": 1.08080804, + "epoch": 0.1327521418908763, + "flos": 23771180458080.0, + "grad_norm": 1.997679864631868, + "language_loss": 0.84520453, + "learning_rate": 3.890265893930578e-06, + "loss": 0.87401342, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 2.792614698410034 + }, + { + "auxiliary_loss_clip": 0.01550752, + "auxiliary_loss_mlp": 0.01313339, + "balance_loss_clip": 1.1897912, + "balance_loss_mlp": 1.05546594, + "epoch": 0.13281226514354427, + "flos": 26508027951360.0, + "grad_norm": 1.9194926897178255, + "language_loss": 0.85651672, + "learning_rate": 3.890138626430876e-06, + "loss": 0.88515759, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.8940813541412354 + }, + { + "auxiliary_loss_clip": 0.01556818, + "auxiliary_loss_mlp": 0.01343883, + "balance_loss_clip": 1.19745946, + "balance_loss_mlp": 1.08410215, + "epoch": 0.13287238839621224, + "flos": 24501077255520.0, + "grad_norm": 2.4859779999124876, + "language_loss": 0.82622528, + "learning_rate": 3.890011287256929e-06, + "loss": 0.85523224, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 2.803464889526367 + }, + { + "auxiliary_loss_clip": 0.01636513, + "auxiliary_loss_mlp": 0.01295128, + "balance_loss_clip": 1.27812862, + "balance_loss_mlp": 1.09828949, + "epoch": 0.1329325116488802, + "flos": 67700923922880.0, + "grad_norm": 0.8360057578724006, + "language_loss": 0.57985592, + "learning_rate": 3.889883876413563e-06, + "loss": 0.60917234, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.4033138751983643 + }, + { + "auxiliary_loss_clip": 0.01631293, + "auxiliary_loss_mlp": 0.01297157, + "balance_loss_clip": 1.2737143, + "balance_loss_mlp": 1.10146332, + "epoch": 0.13299263490154817, + "flos": 72269112909600.0, + "grad_norm": 0.8723695342895688, + "language_loss": 0.55256945, + "learning_rate": 3.889756393905611e-06, + "loss": 0.58185399, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 3.375455379486084 + }, + { + "auxiliary_loss_clip": 0.0154674, + "auxiliary_loss_mlp": 0.01336722, + "balance_loss_clip": 1.18701887, + "balance_loss_mlp": 1.06721377, + "epoch": 0.13305275815421613, + "flos": 17933219779680.0, + "grad_norm": 3.901917324422369, + "language_loss": 0.74116719, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77000183, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 2.7185983657836914 + }, + { + "auxiliary_loss_clip": 0.01551235, + "auxiliary_loss_mlp": 0.01294738, + "balance_loss_clip": 1.19260764, + "balance_loss_mlp": 1.02942586, + "epoch": 0.13311288140688413, + "flos": 22342791744000.0, + "grad_norm": 1.789019068447867, + "language_loss": 0.79478055, + "learning_rate": 3.889501213915291e-06, + "loss": 0.82324028, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 2.788245439529419 + }, + { + "auxiliary_loss_clip": 0.01552458, + "auxiliary_loss_mlp": 0.01329938, + "balance_loss_clip": 1.19425571, + "balance_loss_mlp": 1.06653285, + "epoch": 0.1331730046595521, + "flos": 31871616396480.0, + "grad_norm": 3.0206091956974395, + "language_loss": 0.69481188, + "learning_rate": 3.889373516442597e-06, + "loss": 0.72363579, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 2.893606424331665 + }, + { + "auxiliary_loss_clip": 0.01546228, + "auxiliary_loss_mlp": 0.01327892, + "balance_loss_clip": 1.18701375, + "balance_loss_mlp": 1.07001829, + "epoch": 0.13323312791222006, + "flos": 22568883691680.0, + "grad_norm": 3.2074849841836643, + "language_loss": 0.81349659, + "learning_rate": 3.889245747324671e-06, + "loss": 0.84223777, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 2.8427886962890625 + }, + { + "auxiliary_loss_clip": 0.01551505, + "auxiliary_loss_mlp": 0.01337584, + "balance_loss_clip": 1.191993, + "balance_loss_mlp": 1.08009243, + "epoch": 0.13329325116488802, + "flos": 15087100164480.0, + "grad_norm": 6.443296626152804, + "language_loss": 0.86983013, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89872098, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 2.787395715713501 + }, + { + "auxiliary_loss_clip": 0.0155117, + "auxiliary_loss_mlp": 0.01356173, + "balance_loss_clip": 1.19207478, + "balance_loss_mlp": 1.09601045, + "epoch": 0.133353374417556, + "flos": 27456127639200.0, + "grad_norm": 3.210200687782185, + "language_loss": 0.72953445, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75860786, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.8158628940582275 + }, + { + "auxiliary_loss_clip": 0.01550396, + "auxiliary_loss_mlp": 0.0135525, + "balance_loss_clip": 1.19240379, + "balance_loss_mlp": 1.09489655, + "epoch": 0.13341349767022395, + "flos": 24096834918720.0, + "grad_norm": 8.246570446328326, + "language_loss": 0.87512112, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.90417755, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 4.374691963195801 + }, + { + "auxiliary_loss_clip": 0.01549518, + "auxiliary_loss_mlp": 0.01374144, + "balance_loss_clip": 1.19204867, + "balance_loss_mlp": 1.11283684, + "epoch": 0.13347362092289192, + "flos": 24135597862560.0, + "grad_norm": 2.0768948654359876, + "language_loss": 0.77494633, + "learning_rate": 3.888733954497574e-06, + "loss": 0.80418295, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.778167724609375 + }, + { + "auxiliary_loss_clip": 0.01553307, + "auxiliary_loss_mlp": 0.01356208, + "balance_loss_clip": 1.19445515, + "balance_loss_mlp": 1.10596395, + "epoch": 0.1335337441755599, + "flos": 18438200402400.0, + "grad_norm": 3.0889422390828454, + "language_loss": 0.79236424, + "learning_rate": 3.888605827226212e-06, + "loss": 0.82145935, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.7545881271362305 + }, + { + "auxiliary_loss_clip": 0.01604654, + "auxiliary_loss_mlp": 0.01324249, + "balance_loss_clip": 1.25435162, + "balance_loss_mlp": 1.13122559, + "epoch": 0.13359386742822787, + "flos": 50617227450240.0, + "grad_norm": 1.0676446606956957, + "language_loss": 0.68984044, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71912944, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 3.143507957458496 + }, + { + "auxiliary_loss_clip": 0.01550837, + "auxiliary_loss_mlp": 0.01335764, + "balance_loss_clip": 1.19302726, + "balance_loss_mlp": 1.07750869, + "epoch": 0.13365399068089584, + "flos": 22780487082240.0, + "grad_norm": 2.154187866230882, + "language_loss": 0.67627794, + "learning_rate": 3.888349357839982e-06, + "loss": 0.70514399, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.81906795501709 + }, + { + "auxiliary_loss_clip": 0.0154673, + "auxiliary_loss_mlp": 0.01303317, + "balance_loss_clip": 1.18884444, + "balance_loss_mlp": 1.04525256, + "epoch": 0.1337141139335638, + "flos": 12533257864800.0, + "grad_norm": 2.0319888188236943, + "language_loss": 0.82553196, + "learning_rate": 3.88822101573484e-06, + "loss": 0.85403246, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.786450147628784 + }, + { + "auxiliary_loss_clip": 0.01548256, + "auxiliary_loss_mlp": 0.01334465, + "balance_loss_clip": 1.19020462, + "balance_loss_mlp": 1.07144177, + "epoch": 0.13377423718623177, + "flos": 23041169876160.0, + "grad_norm": 2.0826106686034667, + "language_loss": 0.66186744, + "learning_rate": 3.888092602028167e-06, + "loss": 0.69069463, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.810755491256714 + }, + { + "auxiliary_loss_clip": 0.01545811, + "auxiliary_loss_mlp": 0.01316997, + "balance_loss_clip": 1.18854761, + "balance_loss_mlp": 1.05607152, + "epoch": 0.13383436043889974, + "flos": 16218242974080.0, + "grad_norm": 2.772639209212616, + "language_loss": 0.89585835, + "learning_rate": 3.887964116724835e-06, + "loss": 0.92448652, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.7710487842559814 + }, + { + "auxiliary_loss_clip": 0.01555866, + "auxiliary_loss_mlp": 0.01348885, + "balance_loss_clip": 1.19833207, + "balance_loss_mlp": 1.09063017, + "epoch": 0.1338944836915677, + "flos": 24281964089280.0, + "grad_norm": 3.182813593453601, + "language_loss": 0.74086928, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76991677, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 4.3081374168396 + }, + { + "auxiliary_loss_clip": 0.01553027, + "auxiliary_loss_mlp": 0.0134613, + "balance_loss_clip": 1.19390535, + "balance_loss_mlp": 1.08978188, + "epoch": 0.1339546069442357, + "flos": 17600434824960.0, + "grad_norm": 2.5019391264122537, + "language_loss": 0.85322523, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.88221681, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 4.380034923553467 + }, + { + "auxiliary_loss_clip": 0.01550065, + "auxiliary_loss_mlp": 0.01325114, + "balance_loss_clip": 1.19160199, + "balance_loss_mlp": 1.07506061, + "epoch": 0.13401473019690366, + "flos": 18992108715840.0, + "grad_norm": 2.5224772051127595, + "language_loss": 0.8124404, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.84119219, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 4.3735857009887695 + }, + { + "auxiliary_loss_clip": 0.01554563, + "auxiliary_loss_mlp": 0.01307164, + "balance_loss_clip": 1.19626892, + "balance_loss_mlp": 1.04948163, + "epoch": 0.13407485344957162, + "flos": 26946292212000.0, + "grad_norm": 1.9224665464551343, + "language_loss": 0.74326539, + "learning_rate": 3.887449459642378e-06, + "loss": 0.77188265, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.835637331008911 + }, + { + "auxiliary_loss_clip": 0.01552167, + "auxiliary_loss_mlp": 0.01319268, + "balance_loss_clip": 1.19344568, + "balance_loss_mlp": 1.06826067, + "epoch": 0.1341349767022396, + "flos": 20341454415840.0, + "grad_norm": 2.694328553387129, + "language_loss": 0.800596, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82931042, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.7729427814483643 + }, + { + "auxiliary_loss_clip": 0.01556133, + "auxiliary_loss_mlp": 0.01333658, + "balance_loss_clip": 1.19843507, + "balance_loss_mlp": 1.08665621, + "epoch": 0.13419509995490755, + "flos": 29864627772480.0, + "grad_norm": 1.750437253999474, + "language_loss": 0.72387135, + "learning_rate": 3.887191701647992e-06, + "loss": 0.75276929, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 2.8593902587890625 + }, + { + "auxiliary_loss_clip": 0.01553737, + "auxiliary_loss_mlp": 0.01342752, + "balance_loss_clip": 1.19338584, + "balance_loss_mlp": 1.09365225, + "epoch": 0.13425522320757552, + "flos": 26945419864320.0, + "grad_norm": 7.677958130512207, + "language_loss": 0.66179574, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.69076067, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 2.814016819000244 + }, + { + "auxiliary_loss_clip": 0.01550379, + "auxiliary_loss_mlp": 0.01338652, + "balance_loss_clip": 1.1896975, + "balance_loss_mlp": 1.09298551, + "epoch": 0.1343153464602435, + "flos": 15779030509440.0, + "grad_norm": 2.8739125842600717, + "language_loss": 0.8114053, + "learning_rate": 3.886933657403615e-06, + "loss": 0.84029555, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 2.796518564224243 + }, + { + "auxiliary_loss_clip": 0.01555083, + "auxiliary_loss_mlp": 0.01330742, + "balance_loss_clip": 1.19542944, + "balance_loss_mlp": 1.08354998, + "epoch": 0.13437546971291148, + "flos": 24316972145280.0, + "grad_norm": 2.1847596098434794, + "language_loss": 0.82487965, + "learning_rate": 3.886804527949909e-06, + "loss": 0.85373789, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 2.8125295639038086 + }, + { + "auxiliary_loss_clip": 0.01552933, + "auxiliary_loss_mlp": 0.01335706, + "balance_loss_clip": 1.19158018, + "balance_loss_mlp": 1.08851326, + "epoch": 0.13443559296557944, + "flos": 26653066692480.0, + "grad_norm": 1.9180002787994601, + "language_loss": 0.86731505, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.89620137, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 2.8506486415863037 + }, + { + "auxiliary_loss_clip": 0.01553133, + "auxiliary_loss_mlp": 0.01320403, + "balance_loss_clip": 1.19219184, + "balance_loss_mlp": 1.07797909, + "epoch": 0.1344957162182474, + "flos": 21798213757920.0, + "grad_norm": 1.6849407676211774, + "language_loss": 0.77430689, + "learning_rate": 3.886546054403946e-06, + "loss": 0.80304217, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 2.773573398590088 + }, + { + "auxiliary_loss_clip": 0.0155231, + "auxiliary_loss_mlp": 0.01342179, + "balance_loss_clip": 1.19321954, + "balance_loss_mlp": 1.09269738, + "epoch": 0.13455583947091537, + "flos": 19867651104960.0, + "grad_norm": 2.0737420325954083, + "language_loss": 0.79048252, + "learning_rate": 3.886416710321491e-06, + "loss": 0.81942737, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 2.764831304550171 + }, + { + "auxiliary_loss_clip": 0.01555932, + "auxiliary_loss_mlp": 0.01346443, + "balance_loss_clip": 1.194731, + "balance_loss_mlp": 1.10192108, + "epoch": 0.13461596272358334, + "flos": 30849480211680.0, + "grad_norm": 6.39102614687071, + "language_loss": 0.6844244, + "learning_rate": 3.886287294705924e-06, + "loss": 0.71344817, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.8724141120910645 + }, + { + "auxiliary_loss_clip": 0.01550507, + "auxiliary_loss_mlp": 0.01358244, + "balance_loss_clip": 1.18983996, + "balance_loss_mlp": 1.11047959, + "epoch": 0.1346760859762513, + "flos": 12496049975520.0, + "grad_norm": 3.3138962382422097, + "language_loss": 0.81703985, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.84612733, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.849808931350708 + }, + { + "auxiliary_loss_clip": 0.01556241, + "auxiliary_loss_mlp": 0.0136278, + "balance_loss_clip": 1.19526649, + "balance_loss_mlp": 1.10586023, + "epoch": 0.1347362092289193, + "flos": 21838228331040.0, + "grad_norm": 4.887576205391336, + "language_loss": 0.7784158, + "learning_rate": 3.886028248895093e-06, + "loss": 0.80760598, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 2.826098680496216 + }, + { + "auxiliary_loss_clip": 0.01558603, + "auxiliary_loss_mlp": 0.01361661, + "balance_loss_clip": 1.19728029, + "balance_loss_mlp": 1.11465931, + "epoch": 0.13479633248158726, + "flos": 23511483796320.0, + "grad_norm": 1.8418138021520003, + "language_loss": 0.83453995, + "learning_rate": 3.88589861870965e-06, + "loss": 0.86374259, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.790095567703247 + }, + { + "auxiliary_loss_clip": 0.01552799, + "auxiliary_loss_mlp": 0.01361408, + "balance_loss_clip": 1.19148374, + "balance_loss_mlp": 1.11211705, + "epoch": 0.13485645573425523, + "flos": 29346524006400.0, + "grad_norm": 3.5974173481421663, + "language_loss": 0.65162617, + "learning_rate": 3.885768917010744e-06, + "loss": 0.68076825, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 2.853196620941162 + }, + { + "auxiliary_loss_clip": 0.01562693, + "auxiliary_loss_mlp": 0.01330094, + "balance_loss_clip": 1.20273948, + "balance_loss_mlp": 1.08099389, + "epoch": 0.1349165789869232, + "flos": 28039582353600.0, + "grad_norm": 1.8642651361625386, + "language_loss": 0.7253924, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.75432026, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.8465018272399902 + }, + { + "auxiliary_loss_clip": 0.01547133, + "auxiliary_loss_mlp": 0.01309219, + "balance_loss_clip": 1.18777251, + "balance_loss_mlp": 1.06164479, + "epoch": 0.13497670223959116, + "flos": 22855775208480.0, + "grad_norm": 1.795983972834454, + "language_loss": 0.86455035, + "learning_rate": 3.88550929909221e-06, + "loss": 0.89311385, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.7818267345428467 + }, + { + "auxiliary_loss_clip": 0.01565951, + "auxiliary_loss_mlp": 0.01326216, + "balance_loss_clip": 1.20610166, + "balance_loss_mlp": 1.07787943, + "epoch": 0.13503682549225912, + "flos": 16506234407520.0, + "grad_norm": 1.8345945692589853, + "language_loss": 0.78935724, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81827891, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.779301881790161 + }, + { + "auxiliary_loss_clip": 0.01635877, + "auxiliary_loss_mlp": 0.01316841, + "balance_loss_clip": 1.28264976, + "balance_loss_mlp": 1.11237335, + "epoch": 0.1350969487449271, + "flos": 70762894882560.0, + "grad_norm": 0.7833063008784702, + "language_loss": 0.60523868, + "learning_rate": 3.885249395178874e-06, + "loss": 0.63476574, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 3.3854875564575195 + }, + { + "auxiliary_loss_clip": 0.015634, + "auxiliary_loss_mlp": 0.01341227, + "balance_loss_clip": 1.20462954, + "balance_loss_mlp": 1.09746742, + "epoch": 0.13515707199759508, + "flos": 23078226052800.0, + "grad_norm": 4.898297690251649, + "language_loss": 0.81026793, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83931416, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 2.8153076171875 + }, + { + "auxiliary_loss_clip": 0.01565815, + "auxiliary_loss_mlp": 0.01335864, + "balance_loss_clip": 1.20809031, + "balance_loss_mlp": 1.09649158, + "epoch": 0.13521719525026304, + "flos": 23188673947680.0, + "grad_norm": 2.726299710395884, + "language_loss": 0.77096641, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79998314, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 2.7849810123443604 + }, + { + "auxiliary_loss_clip": 0.01559959, + "auxiliary_loss_mlp": 0.01309041, + "balance_loss_clip": 1.20227075, + "balance_loss_mlp": 1.0679518, + "epoch": 0.135277318502931, + "flos": 24793278714720.0, + "grad_norm": 1.613709440619028, + "language_loss": 0.84361792, + "learning_rate": 3.884859003154862e-06, + "loss": 0.8723079, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.8569321632385254 + }, + { + "auxiliary_loss_clip": 0.0156025, + "auxiliary_loss_mlp": 0.01323089, + "balance_loss_clip": 1.20309901, + "balance_loss_mlp": 1.08238149, + "epoch": 0.13533744175559898, + "flos": 21910709773440.0, + "grad_norm": 2.814835395709418, + "language_loss": 0.81991076, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84874415, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 2.7806484699249268 + }, + { + "auxiliary_loss_clip": 0.01562623, + "auxiliary_loss_mlp": 0.01334711, + "balance_loss_clip": 1.20539403, + "balance_loss_mlp": 1.09324014, + "epoch": 0.13539756500826694, + "flos": 21213583270560.0, + "grad_norm": 2.0373374972242067, + "language_loss": 0.86262107, + "learning_rate": 3.884598384427084e-06, + "loss": 0.89159441, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.9325807094573975 + }, + { + "auxiliary_loss_clip": 0.01625313, + "auxiliary_loss_mlp": 0.01213249, + "balance_loss_clip": 1.27361941, + "balance_loss_mlp": 1.01908112, + "epoch": 0.1354576882609349, + "flos": 63248378988960.0, + "grad_norm": 0.7653317721613851, + "language_loss": 0.61689031, + "learning_rate": 3.884467967864485e-06, + "loss": 0.64527595, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 3.4107553958892822 + }, + { + "auxiliary_loss_clip": 0.01560202, + "auxiliary_loss_mlp": 0.0136416, + "balance_loss_clip": 1.20272398, + "balance_loss_mlp": 1.1251694, + "epoch": 0.1355178115136029, + "flos": 25485360772320.0, + "grad_norm": 2.067929570722123, + "language_loss": 0.89943695, + "learning_rate": 3.884337479842671e-06, + "loss": 0.9286806, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 2.839358329772949 + }, + { + "auxiliary_loss_clip": 0.01559577, + "auxiliary_loss_mlp": 0.01374234, + "balance_loss_clip": 1.20255125, + "balance_loss_mlp": 1.13295388, + "epoch": 0.13557793476627086, + "flos": 21619153092960.0, + "grad_norm": 2.915284674540998, + "language_loss": 0.8442421, + "learning_rate": 3.884206920366591e-06, + "loss": 0.8735801, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 2.8206934928894043 + }, + { + "auxiliary_loss_clip": 0.01556267, + "auxiliary_loss_mlp": 0.01381306, + "balance_loss_clip": 1.19885445, + "balance_loss_mlp": 1.13850069, + "epoch": 0.13563805801893883, + "flos": 24930124973280.0, + "grad_norm": 3.045844220474771, + "language_loss": 0.74998581, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77936161, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.818086624145508 + }, + { + "auxiliary_loss_clip": 0.01554795, + "auxiliary_loss_mlp": 0.01394357, + "balance_loss_clip": 1.19760835, + "balance_loss_mlp": 1.15383983, + "epoch": 0.1356981812716068, + "flos": 14751660238560.0, + "grad_norm": 5.143885807173998, + "language_loss": 0.83113694, + "learning_rate": 3.88394558707144e-06, + "loss": 0.86062849, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 4.297421932220459 + }, + { + "auxiliary_loss_clip": 0.01558374, + "auxiliary_loss_mlp": 0.01407391, + "balance_loss_clip": 1.2011857, + "balance_loss_mlp": 1.16248786, + "epoch": 0.13575830452427476, + "flos": 11110141164960.0, + "grad_norm": 2.9666101053872387, + "language_loss": 0.81853104, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84818864, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.7576279640197754 + }, + { + "auxiliary_loss_clip": 0.01555493, + "auxiliary_loss_mlp": 0.01386349, + "balance_loss_clip": 1.19868827, + "balance_loss_mlp": 1.14373446, + "epoch": 0.13581842777694272, + "flos": 17961817976640.0, + "grad_norm": 6.7473211983472225, + "language_loss": 0.82880652, + "learning_rate": 3.883683968018669e-06, + "loss": 0.85822493, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.7453114986419678 + }, + { + "auxiliary_loss_clip": 0.01552381, + "auxiliary_loss_mlp": 0.01371171, + "balance_loss_clip": 1.19596159, + "balance_loss_mlp": 1.13179874, + "epoch": 0.1358785510296107, + "flos": 22859378383680.0, + "grad_norm": 2.4389881152421165, + "language_loss": 0.74019909, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.76943457, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 2.811511278152466 + }, + { + "auxiliary_loss_clip": 0.01549506, + "auxiliary_loss_mlp": 0.01392382, + "balance_loss_clip": 1.19390869, + "balance_loss_mlp": 1.15052986, + "epoch": 0.13593867428227868, + "flos": 25741795612320.0, + "grad_norm": 2.626364885064716, + "language_loss": 0.75171965, + "learning_rate": 3.883422063247961e-06, + "loss": 0.78113854, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 2.815142869949341 + }, + { + "auxiliary_loss_clip": 0.01548795, + "auxiliary_loss_mlp": 0.01365683, + "balance_loss_clip": 1.19202423, + "balance_loss_mlp": 1.12077951, + "epoch": 0.13599879753494665, + "flos": 31251939924960.0, + "grad_norm": 3.8728511933651015, + "language_loss": 0.63654995, + "learning_rate": 3.883291003730794e-06, + "loss": 0.66569471, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.948209047317505 + }, + { + "auxiliary_loss_clip": 0.01556499, + "auxiliary_loss_mlp": 0.013614, + "balance_loss_clip": 1.19967079, + "balance_loss_mlp": 1.11783147, + "epoch": 0.1360589207876146, + "flos": 23917319115840.0, + "grad_norm": 3.998239199623059, + "language_loss": 0.81984603, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84902507, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.8191592693328857 + }, + { + "auxiliary_loss_clip": 0.01552771, + "auxiliary_loss_mlp": 0.01337043, + "balance_loss_clip": 1.19668126, + "balance_loss_mlp": 1.08908772, + "epoch": 0.13611904404028258, + "flos": 19976240520000.0, + "grad_norm": 1.9961364911501345, + "language_loss": 0.88029724, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90919536, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.8312764167785645 + }, + { + "auxiliary_loss_clip": 0.01554718, + "auxiliary_loss_mlp": 0.01316723, + "balance_loss_clip": 1.19892693, + "balance_loss_mlp": 1.06457186, + "epoch": 0.13617916729295054, + "flos": 15342435087840.0, + "grad_norm": 3.203872175867496, + "language_loss": 0.71528494, + "learning_rate": 3.882897396711683e-06, + "loss": 0.7439993, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 4.266226291656494 + }, + { + "auxiliary_loss_clip": 0.01558312, + "auxiliary_loss_mlp": 0.01317445, + "balance_loss_clip": 1.20147526, + "balance_loss_mlp": 1.06643748, + "epoch": 0.1362392905456185, + "flos": 27453776093280.0, + "grad_norm": 5.240143438214287, + "language_loss": 0.67073333, + "learning_rate": 3.882766051566027e-06, + "loss": 0.6994909, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 4.366527318954468 + }, + { + "auxiliary_loss_clip": 0.01556614, + "auxiliary_loss_mlp": 0.01335079, + "balance_loss_clip": 1.20040584, + "balance_loss_mlp": 1.08464384, + "epoch": 0.1362994137982865, + "flos": 25011329892480.0, + "grad_norm": 1.7786834487525867, + "language_loss": 0.76873231, + "learning_rate": 3.882634635025694e-06, + "loss": 0.79764926, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.791377067565918 + }, + { + "auxiliary_loss_clip": 0.01546724, + "auxiliary_loss_mlp": 0.01336263, + "balance_loss_clip": 1.1911099, + "balance_loss_mlp": 1.08353877, + "epoch": 0.13635953705095447, + "flos": 20305118874240.0, + "grad_norm": 1.9999791748885178, + "language_loss": 0.82280707, + "learning_rate": 3.882503147095667e-06, + "loss": 0.85163701, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.8365120887756348 + }, + { + "auxiliary_loss_clip": 0.01559601, + "auxiliary_loss_mlp": 0.01339093, + "balance_loss_clip": 1.20309949, + "balance_loss_mlp": 1.08484364, + "epoch": 0.13641966030362243, + "flos": 31361136190560.0, + "grad_norm": 1.884483431426869, + "language_loss": 0.76246595, + "learning_rate": 3.882371587780931e-06, + "loss": 0.79145288, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.826364755630493 + }, + { + "auxiliary_loss_clip": 0.01561304, + "auxiliary_loss_mlp": 0.013223, + "balance_loss_clip": 1.20524621, + "balance_loss_mlp": 1.07053018, + "epoch": 0.1364797835562904, + "flos": 20479779872640.0, + "grad_norm": 2.5830703623687876, + "language_loss": 0.80475318, + "learning_rate": 3.882239957086477e-06, + "loss": 0.8335892, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.7474472522735596 + }, + { + "auxiliary_loss_clip": 0.01554989, + "auxiliary_loss_mlp": 0.01325506, + "balance_loss_clip": 1.19795394, + "balance_loss_mlp": 1.07621574, + "epoch": 0.13653990680895836, + "flos": 13079997756000.0, + "grad_norm": 2.633008579808435, + "language_loss": 0.76029772, + "learning_rate": 3.882108255017295e-06, + "loss": 0.78910267, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 2.747284173965454 + }, + { + "auxiliary_loss_clip": 0.01551849, + "auxiliary_loss_mlp": 0.01318872, + "balance_loss_clip": 1.19555449, + "balance_loss_mlp": 1.07244229, + "epoch": 0.13660003006162633, + "flos": 16948632837600.0, + "grad_norm": 2.560225492127191, + "language_loss": 0.80297643, + "learning_rate": 3.881976481578379e-06, + "loss": 0.83168364, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 2.715029239654541 + }, + { + "auxiliary_loss_clip": 0.01623227, + "auxiliary_loss_mlp": 0.01294579, + "balance_loss_clip": 1.27155411, + "balance_loss_mlp": 1.10651398, + "epoch": 0.1366601533142943, + "flos": 68689569178080.0, + "grad_norm": 0.708324730741808, + "language_loss": 0.6064086, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.63558662, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 3.473020553588867 + }, + { + "auxiliary_loss_clip": 0.01551152, + "auxiliary_loss_mlp": 0.01321212, + "balance_loss_clip": 1.19460881, + "balance_loss_mlp": 1.07306635, + "epoch": 0.13672027656696228, + "flos": 19246229938080.0, + "grad_norm": 1.9487507420172223, + "language_loss": 0.77928686, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80801046, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.7768242359161377 + }, + { + "auxiliary_loss_clip": 0.01549006, + "auxiliary_loss_mlp": 0.01304997, + "balance_loss_clip": 1.19314933, + "balance_loss_mlp": 1.05475307, + "epoch": 0.13678039981963025, + "flos": 24537223156320.0, + "grad_norm": 2.0040814882890463, + "language_loss": 0.78960675, + "learning_rate": 3.881580733093211e-06, + "loss": 0.81814671, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 2.8271102905273438 + }, + { + "auxiliary_loss_clip": 0.01549531, + "auxiliary_loss_mlp": 0.01321753, + "balance_loss_clip": 1.19320643, + "balance_loss_mlp": 1.06712246, + "epoch": 0.13684052307229821, + "flos": 15671161729440.0, + "grad_norm": 2.9092595701235835, + "language_loss": 0.81608665, + "learning_rate": 3.881448674225356e-06, + "loss": 0.84479946, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 2.767066240310669 + }, + { + "auxiliary_loss_clip": 0.01553771, + "auxiliary_loss_mlp": 0.01346345, + "balance_loss_clip": 1.19710875, + "balance_loss_mlp": 1.08942509, + "epoch": 0.13690064632496618, + "flos": 28367133222240.0, + "grad_norm": 3.0188883017621566, + "language_loss": 0.70324552, + "learning_rate": 3.881316544012779e-06, + "loss": 0.73224664, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 2.821990489959717 + }, + { + "auxiliary_loss_clip": 0.01550843, + "auxiliary_loss_mlp": 0.01344105, + "balance_loss_clip": 1.19418979, + "balance_loss_mlp": 1.08813858, + "epoch": 0.13696076957763414, + "flos": 23407066478880.0, + "grad_norm": 2.142533962882144, + "language_loss": 0.80207288, + "learning_rate": 3.88118434246049e-06, + "loss": 0.83102238, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 2.88256573677063 + }, + { + "auxiliary_loss_clip": 0.01561144, + "auxiliary_loss_mlp": 0.01338108, + "balance_loss_clip": 1.20649195, + "balance_loss_mlp": 1.08671927, + "epoch": 0.1370208928303021, + "flos": 37199665791360.0, + "grad_norm": 2.411095224973591, + "language_loss": 0.74881732, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77780986, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 2.8951406478881836 + }, + { + "auxiliary_loss_clip": 0.01547994, + "auxiliary_loss_mlp": 0.01322571, + "balance_loss_clip": 1.19241703, + "balance_loss_mlp": 1.06813073, + "epoch": 0.13708101608297008, + "flos": 26978569440480.0, + "grad_norm": 2.244286204103621, + "language_loss": 0.77039826, + "learning_rate": 3.880919725356831e-06, + "loss": 0.79910386, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.9086530208587646 + }, + { + "auxiliary_loss_clip": 0.01552187, + "auxiliary_loss_mlp": 0.01309469, + "balance_loss_clip": 1.19688165, + "balance_loss_mlp": 1.05884302, + "epoch": 0.13714113933563807, + "flos": 32559336715680.0, + "grad_norm": 1.8323029278044503, + "language_loss": 0.79952455, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82814103, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.9147727489471436 + }, + { + "auxiliary_loss_clip": 0.01554519, + "auxiliary_loss_mlp": 0.01333646, + "balance_loss_clip": 1.20063031, + "balance_loss_mlp": 1.08302045, + "epoch": 0.13720126258830603, + "flos": 16102788562080.0, + "grad_norm": 1.996828180623955, + "language_loss": 0.83805299, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86693466, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.7796123027801514 + }, + { + "auxiliary_loss_clip": 0.01547382, + "auxiliary_loss_mlp": 0.01349862, + "balance_loss_clip": 1.19240689, + "balance_loss_mlp": 1.10514951, + "epoch": 0.137261385840974, + "flos": 18955583533440.0, + "grad_norm": 1.7871533066904317, + "language_loss": 0.74031115, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76928359, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 2.7827067375183105 + }, + { + "auxiliary_loss_clip": 0.0155684, + "auxiliary_loss_mlp": 0.01349524, + "balance_loss_clip": 1.20158613, + "balance_loss_mlp": 1.10938931, + "epoch": 0.13732150909364196, + "flos": 23297642644320.0, + "grad_norm": 3.434906149765711, + "language_loss": 0.84779918, + "learning_rate": 3.880389635293729e-06, + "loss": 0.87686276, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 2.819175958633423 + }, + { + "auxiliary_loss_clip": 0.01547602, + "auxiliary_loss_mlp": 0.01336229, + "balance_loss_clip": 1.19358706, + "balance_loss_mlp": 1.0899899, + "epoch": 0.13738163234630993, + "flos": 29353844141280.0, + "grad_norm": 2.150181776833478, + "language_loss": 0.74947238, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77831066, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.9243276119232178 + }, + { + "auxiliary_loss_clip": 0.015531, + "auxiliary_loss_mlp": 0.01340179, + "balance_loss_clip": 1.19943893, + "balance_loss_mlp": 1.09870911, + "epoch": 0.1374417555989779, + "flos": 26653749399360.0, + "grad_norm": 1.8469066604419881, + "language_loss": 0.75222611, + "learning_rate": 3.880124162414689e-06, + "loss": 0.78115892, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.798826217651367 + }, + { + "auxiliary_loss_clip": 0.0155261, + "auxiliary_loss_mlp": 0.01346636, + "balance_loss_clip": 1.19696236, + "balance_loss_mlp": 1.10573769, + "epoch": 0.1375018788516459, + "flos": 28405972022400.0, + "grad_norm": 2.971729959630137, + "language_loss": 0.86564016, + "learning_rate": 3.879991319030908e-06, + "loss": 0.89463264, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 2.818643093109131 + }, + { + "auxiliary_loss_clip": 0.01546476, + "auxiliary_loss_mlp": 0.01319358, + "balance_loss_clip": 1.19213641, + "balance_loss_mlp": 1.07636142, + "epoch": 0.13756200210431385, + "flos": 37416768765120.0, + "grad_norm": 9.174471788377202, + "language_loss": 0.68479228, + "learning_rate": 3.879858404357666e-06, + "loss": 0.71345067, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 2.918949842453003 + }, + { + "auxiliary_loss_clip": 0.01553638, + "auxiliary_loss_mlp": 0.01352558, + "balance_loss_clip": 1.19829369, + "balance_loss_mlp": 1.11146879, + "epoch": 0.13762212535698182, + "flos": 22713239725920.0, + "grad_norm": 2.9770681677576794, + "language_loss": 0.87439525, + "learning_rate": 3.879725418400005e-06, + "loss": 0.90345716, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.802849292755127 + }, + { + "auxiliary_loss_clip": 0.01547115, + "auxiliary_loss_mlp": 0.01344255, + "balance_loss_clip": 1.19230747, + "balance_loss_mlp": 1.09839773, + "epoch": 0.13768224860964978, + "flos": 23954451148800.0, + "grad_norm": 2.0588559167037177, + "language_loss": 0.74669671, + "learning_rate": 3.879592361162969e-06, + "loss": 0.77561039, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 2.8037872314453125 + }, + { + "auxiliary_loss_clip": 0.01617239, + "auxiliary_loss_mlp": 0.01269974, + "balance_loss_clip": 1.27044022, + "balance_loss_mlp": 1.07008362, + "epoch": 0.13774237186231775, + "flos": 63597852334080.0, + "grad_norm": 1.4027117826405575, + "language_loss": 0.51585865, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.54473078, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.3860116004943848 + }, + { + "auxiliary_loss_clip": 0.01540836, + "auxiliary_loss_mlp": 0.01313469, + "balance_loss_clip": 1.18692422, + "balance_loss_mlp": 1.06723011, + "epoch": 0.1378024951149857, + "flos": 24281736520320.0, + "grad_norm": 1.9428266380544632, + "language_loss": 0.712672, + "learning_rate": 3.879326032870952e-06, + "loss": 0.74121511, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 2.787670135498047 + }, + { + "auxiliary_loss_clip": 0.01545466, + "auxiliary_loss_mlp": 0.01351077, + "balance_loss_clip": 1.19057238, + "balance_loss_mlp": 1.10960722, + "epoch": 0.13786261836765368, + "flos": 14022939214080.0, + "grad_norm": 3.687608016190068, + "language_loss": 0.8016876, + "learning_rate": 3.879192761826071e-06, + "loss": 0.83065307, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.765963554382324 + }, + { + "auxiliary_loss_clip": 0.01544101, + "auxiliary_loss_mlp": 0.01376496, + "balance_loss_clip": 1.19027007, + "balance_loss_mlp": 1.13617027, + "epoch": 0.13792274162032167, + "flos": 28881330387840.0, + "grad_norm": 2.9205718851660643, + "language_loss": 0.7839067, + "learning_rate": 3.879059419522011e-06, + "loss": 0.81311274, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.826889991760254 + }, + { + "auxiliary_loss_clip": 0.01542291, + "auxiliary_loss_mlp": 0.01369611, + "balance_loss_clip": 1.18896174, + "balance_loss_mlp": 1.13252783, + "epoch": 0.13798286487298964, + "flos": 21143225805120.0, + "grad_norm": 2.3473402377690835, + "language_loss": 0.79829115, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82741016, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.7554845809936523 + }, + { + "auxiliary_loss_clip": 0.01546472, + "auxiliary_loss_mlp": 0.01381857, + "balance_loss_clip": 1.19198203, + "balance_loss_mlp": 1.14324808, + "epoch": 0.1380429881256576, + "flos": 22489423467840.0, + "grad_norm": 2.2748106297435466, + "language_loss": 0.78555441, + "learning_rate": 3.878792521156588e-06, + "loss": 0.81483769, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 4.38660454750061 + }, + { + "auxiliary_loss_clip": 0.01555184, + "auxiliary_loss_mlp": 0.01403502, + "balance_loss_clip": 1.19947958, + "balance_loss_mlp": 1.16584635, + "epoch": 0.13810311137832557, + "flos": 21395602332000.0, + "grad_norm": 1.8790357685258432, + "language_loss": 0.78449357, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.81408036, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.743089437484741 + }, + { + "auxiliary_loss_clip": 0.01555768, + "auxiliary_loss_mlp": 0.01387983, + "balance_loss_clip": 1.20071292, + "balance_loss_mlp": 1.15013635, + "epoch": 0.13816323463099353, + "flos": 25991972305920.0, + "grad_norm": 2.3959068628019993, + "language_loss": 0.68910539, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71854293, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.7592132091522217 + }, + { + "auxiliary_loss_clip": 0.01545088, + "auxiliary_loss_mlp": 0.01362576, + "balance_loss_clip": 1.19093597, + "balance_loss_mlp": 1.12301314, + "epoch": 0.1382233578836615, + "flos": 19246305794400.0, + "grad_norm": 1.9113320755174446, + "language_loss": 0.86993819, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89901483, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.760021924972534 + }, + { + "auxiliary_loss_clip": 0.01552154, + "auxiliary_loss_mlp": 0.01352294, + "balance_loss_clip": 1.19589162, + "balance_loss_mlp": 1.10948801, + "epoch": 0.1382834811363295, + "flos": 25668669391200.0, + "grad_norm": 1.9776779688061523, + "language_loss": 0.76066124, + "learning_rate": 3.878257869538267e-06, + "loss": 0.78970569, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.7448666095733643 + }, + { + "auxiliary_loss_clip": 0.01545541, + "auxiliary_loss_mlp": 0.01314576, + "balance_loss_clip": 1.18999028, + "balance_loss_mlp": 1.07081676, + "epoch": 0.13834360438899745, + "flos": 19785801407040.0, + "grad_norm": 2.6365100872849503, + "language_loss": 0.8281256, + "learning_rate": 3.878124028561692e-06, + "loss": 0.85672671, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.7315356731414795 + }, + { + "auxiliary_loss_clip": 0.0154546, + "auxiliary_loss_mlp": 0.0133581, + "balance_loss_clip": 1.19038618, + "balance_loss_mlp": 1.0931952, + "epoch": 0.13840372764166542, + "flos": 26654318321760.0, + "grad_norm": 4.092875258839136, + "language_loss": 0.85785848, + "learning_rate": 3.877990116366466e-06, + "loss": 0.88667119, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 2.820117235183716 + }, + { + "auxiliary_loss_clip": 0.0161787, + "auxiliary_loss_mlp": 0.01290604, + "balance_loss_clip": 1.26712942, + "balance_loss_mlp": 1.0907135, + "epoch": 0.13846385089433338, + "flos": 70518141915840.0, + "grad_norm": 0.7713630977662091, + "language_loss": 0.65529436, + "learning_rate": 3.877856132957667e-06, + "loss": 0.6843791, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 4.940772533416748 + }, + { + "auxiliary_loss_clip": 0.01539175, + "auxiliary_loss_mlp": 0.01334256, + "balance_loss_clip": 1.1837194, + "balance_loss_mlp": 1.08649099, + "epoch": 0.13852397414700135, + "flos": 17350713269280.0, + "grad_norm": 2.0723359477173724, + "language_loss": 0.78896296, + "learning_rate": 3.877722078340374e-06, + "loss": 0.81769723, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 4.346985101699829 + }, + { + "auxiliary_loss_clip": 0.01542568, + "auxiliary_loss_mlp": 0.01327088, + "balance_loss_clip": 1.18695021, + "balance_loss_mlp": 1.07875121, + "epoch": 0.13858409739966931, + "flos": 21545951015520.0, + "grad_norm": 2.3224426833939544, + "language_loss": 0.78176749, + "learning_rate": 3.877587952519672e-06, + "loss": 0.81046402, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 4.41637921333313 + }, + { + "auxiliary_loss_clip": 0.01540867, + "auxiliary_loss_mlp": 0.01321465, + "balance_loss_clip": 1.18644977, + "balance_loss_mlp": 1.08247423, + "epoch": 0.13864422065233728, + "flos": 21582058988160.0, + "grad_norm": 2.183479645438912, + "language_loss": 0.87910354, + "learning_rate": 3.877453755500647e-06, + "loss": 0.90772688, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.823180675506592 + }, + { + "auxiliary_loss_clip": 0.01611229, + "auxiliary_loss_mlp": 0.01275192, + "balance_loss_clip": 1.26235354, + "balance_loss_mlp": 1.08483887, + "epoch": 0.13870434390500527, + "flos": 53375770215360.0, + "grad_norm": 0.8942556237225382, + "language_loss": 0.59062731, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61949158, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 3.3174734115600586 + }, + { + "auxiliary_loss_clip": 0.01538358, + "auxiliary_loss_mlp": 0.01353486, + "balance_loss_clip": 1.18392348, + "balance_loss_mlp": 1.11239743, + "epoch": 0.13876446715767324, + "flos": 22568163056640.0, + "grad_norm": 1.8848396268958703, + "language_loss": 0.80201364, + "learning_rate": 3.877185147887984e-06, + "loss": 0.83093208, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 2.815857172012329 + }, + { + "auxiliary_loss_clip": 0.01537555, + "auxiliary_loss_mlp": 0.01312317, + "balance_loss_clip": 1.18240309, + "balance_loss_mlp": 1.06741369, + "epoch": 0.1388245904103412, + "flos": 20707388946720.0, + "grad_norm": 2.816891507942322, + "language_loss": 0.78528029, + "learning_rate": 3.877050737304533e-06, + "loss": 0.813779, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.7856357097625732 + }, + { + "auxiliary_loss_clip": 0.01538945, + "auxiliary_loss_mlp": 0.01322867, + "balance_loss_clip": 1.18382525, + "balance_loss_mlp": 1.07491171, + "epoch": 0.13888471366300917, + "flos": 20556623053440.0, + "grad_norm": 2.0725132610231287, + "language_loss": 0.68123996, + "learning_rate": 3.876916255543129e-06, + "loss": 0.70985806, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 2.718672037124634 + }, + { + "auxiliary_loss_clip": 0.01538849, + "auxiliary_loss_mlp": 0.01321885, + "balance_loss_clip": 1.18453586, + "balance_loss_mlp": 1.07278502, + "epoch": 0.13894483691567713, + "flos": 13839365098080.0, + "grad_norm": 2.6387321038318725, + "language_loss": 0.84175122, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.87035859, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 2.738316535949707 + }, + { + "auxiliary_loss_clip": 0.01536438, + "auxiliary_loss_mlp": 0.01317011, + "balance_loss_clip": 1.18261397, + "balance_loss_mlp": 1.06848383, + "epoch": 0.1390049601683451, + "flos": 28033324207200.0, + "grad_norm": 2.307511598759697, + "language_loss": 0.82602274, + "learning_rate": 3.876647078506866e-06, + "loss": 0.85455716, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 2.794356346130371 + }, + { + "auxiliary_loss_clip": 0.01534169, + "auxiliary_loss_mlp": 0.01306327, + "balance_loss_clip": 1.18068469, + "balance_loss_mlp": 1.05398464, + "epoch": 0.13906508342101306, + "flos": 26759076992640.0, + "grad_norm": 1.9571390715882824, + "language_loss": 0.87050772, + "learning_rate": 3.876512383242215e-06, + "loss": 0.89891267, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 2.8545126914978027 + }, + { + "auxiliary_loss_clip": 0.01541018, + "auxiliary_loss_mlp": 0.01316453, + "balance_loss_clip": 1.18627453, + "balance_loss_mlp": 1.06735361, + "epoch": 0.13912520667368106, + "flos": 24537412797120.0, + "grad_norm": 3.738598345203232, + "language_loss": 0.80437535, + "learning_rate": 3.876377616820024e-06, + "loss": 0.83295012, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 2.78916335105896 + }, + { + "auxiliary_loss_clip": 0.01539901, + "auxiliary_loss_mlp": 0.01319529, + "balance_loss_clip": 1.18524921, + "balance_loss_mlp": 1.07462549, + "epoch": 0.13918532992634902, + "flos": 19384820892000.0, + "grad_norm": 4.136356325438285, + "language_loss": 0.8571623, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88575661, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 2.6728739738464355 + }, + { + "auxiliary_loss_clip": 0.01538772, + "auxiliary_loss_mlp": 0.01329843, + "balance_loss_clip": 1.184214, + "balance_loss_mlp": 1.08722854, + "epoch": 0.139245453179017, + "flos": 21325851717120.0, + "grad_norm": 2.551955701243523, + "language_loss": 0.77173245, + "learning_rate": 3.876107870523477e-06, + "loss": 0.80041862, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.8687407970428467 + }, + { + "auxiliary_loss_clip": 0.01538604, + "auxiliary_loss_mlp": 0.01337894, + "balance_loss_clip": 1.18535411, + "balance_loss_mlp": 1.08974838, + "epoch": 0.13930557643168495, + "flos": 19502664778080.0, + "grad_norm": 2.957650776190164, + "language_loss": 0.77628398, + "learning_rate": 3.875972890659349e-06, + "loss": 0.805049, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 2.7969703674316406 + }, + { + "auxiliary_loss_clip": 0.0153943, + "auxiliary_loss_mlp": 0.01306305, + "balance_loss_clip": 1.18472004, + "balance_loss_mlp": 1.05873072, + "epoch": 0.13936569968435292, + "flos": 25413182755200.0, + "grad_norm": 1.9517240851301971, + "language_loss": 0.80181795, + "learning_rate": 3.875837839658139e-06, + "loss": 0.8302753, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 2.8769381046295166 + }, + { + "auxiliary_loss_clip": 0.01604664, + "auxiliary_loss_mlp": 0.01212921, + "balance_loss_clip": 1.26054978, + "balance_loss_mlp": 1.01150513, + "epoch": 0.13942582293702088, + "flos": 70778483356320.0, + "grad_norm": 0.9722914463202519, + "language_loss": 0.58966076, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61783659, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 3.3719482421875 + }, + { + "auxiliary_loss_clip": 0.01541656, + "auxiliary_loss_mlp": 0.01319986, + "balance_loss_clip": 1.18777204, + "balance_loss_mlp": 1.07203066, + "epoch": 0.13948594618968888, + "flos": 35593847323200.0, + "grad_norm": 2.736934226477408, + "language_loss": 0.65480363, + "learning_rate": 3.875567524264967e-06, + "loss": 0.68342006, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.890223741531372 + }, + { + "auxiliary_loss_clip": 0.01535781, + "auxiliary_loss_mlp": 0.0131882, + "balance_loss_clip": 1.18245709, + "balance_loss_mlp": 1.07658696, + "epoch": 0.13954606944235684, + "flos": 21107307473280.0, + "grad_norm": 2.619015088749558, + "language_loss": 0.70710492, + "learning_rate": 3.875432259883256e-06, + "loss": 0.73565096, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.7708897590637207 + }, + { + "auxiliary_loss_clip": 0.01538141, + "auxiliary_loss_mlp": 0.01334077, + "balance_loss_clip": 1.18477464, + "balance_loss_mlp": 1.08974552, + "epoch": 0.1396061926950248, + "flos": 25046679301920.0, + "grad_norm": 2.6156666810282223, + "language_loss": 0.86030173, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88902396, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 2.835660219192505 + }, + { + "auxiliary_loss_clip": 0.01535279, + "auxiliary_loss_mlp": 0.01304357, + "balance_loss_clip": 1.18334103, + "balance_loss_mlp": 1.06250548, + "epoch": 0.13966631594769277, + "flos": 37637095632480.0, + "grad_norm": 1.8638949275301322, + "language_loss": 0.66919756, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69759393, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.9403505325317383 + }, + { + "auxiliary_loss_clip": 0.01548865, + "auxiliary_loss_mlp": 0.01312015, + "balance_loss_clip": 1.19641721, + "balance_loss_mlp": 1.05910075, + "epoch": 0.13972643920036074, + "flos": 16692880704480.0, + "grad_norm": 2.062774773595651, + "language_loss": 0.88835502, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91696382, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 2.8232598304748535 + }, + { + "auxiliary_loss_clip": 0.01535888, + "auxiliary_loss_mlp": 0.01324696, + "balance_loss_clip": 1.18383431, + "balance_loss_mlp": 1.07655048, + "epoch": 0.1397865624530287, + "flos": 23333295479040.0, + "grad_norm": 3.4257637995641588, + "language_loss": 0.71424568, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7428515, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.7956809997558594 + }, + { + "auxiliary_loss_clip": 0.01545545, + "auxiliary_loss_mlp": 0.01335761, + "balance_loss_clip": 1.1945256, + "balance_loss_mlp": 1.08589816, + "epoch": 0.13984668570569667, + "flos": 22780221585120.0, + "grad_norm": 2.06880305784451, + "language_loss": 0.81943297, + "learning_rate": 3.874754871328688e-06, + "loss": 0.84824604, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 2.781804084777832 + }, + { + "auxiliary_loss_clip": 0.01543744, + "auxiliary_loss_mlp": 0.01323397, + "balance_loss_clip": 1.19360566, + "balance_loss_mlp": 1.07773066, + "epoch": 0.13990680895836466, + "flos": 19466632661760.0, + "grad_norm": 3.7719246426225546, + "language_loss": 0.89246833, + "learning_rate": 3.874619180324534e-06, + "loss": 0.92113972, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 2.8114960193634033 + }, + { + "auxiliary_loss_clip": 0.01551574, + "auxiliary_loss_mlp": 0.01338068, + "balance_loss_clip": 1.20128036, + "balance_loss_mlp": 1.0916388, + "epoch": 0.13996693221103262, + "flos": 20305384371360.0, + "grad_norm": 2.9044811854438746, + "language_loss": 0.84846801, + "learning_rate": 3.874483418234632e-06, + "loss": 0.8773644, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 2.868617534637451 + }, + { + "auxiliary_loss_clip": 0.01540589, + "auxiliary_loss_mlp": 0.01313511, + "balance_loss_clip": 1.18973386, + "balance_loss_mlp": 1.0682261, + "epoch": 0.1400270554637006, + "flos": 26620296397920.0, + "grad_norm": 2.2100972955499283, + "language_loss": 0.74253511, + "learning_rate": 3.874347585064131e-06, + "loss": 0.77107608, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.8621795177459717 + }, + { + "auxiliary_loss_clip": 0.01545254, + "auxiliary_loss_mlp": 0.01326711, + "balance_loss_clip": 1.1950804, + "balance_loss_mlp": 1.08333325, + "epoch": 0.14008717871636855, + "flos": 19393544368800.0, + "grad_norm": 2.2287111914488063, + "language_loss": 0.7813189, + "learning_rate": 3.874211680818183e-06, + "loss": 0.81003857, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 2.7662487030029297 + }, + { + "auxiliary_loss_clip": 0.01544454, + "auxiliary_loss_mlp": 0.01338252, + "balance_loss_clip": 1.19438505, + "balance_loss_mlp": 1.09926128, + "epoch": 0.14014730196903652, + "flos": 15306061618080.0, + "grad_norm": 2.206534904126887, + "language_loss": 0.72720838, + "learning_rate": 3.87407570550194e-06, + "loss": 0.75603545, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 2.7373006343841553 + }, + { + "auxiliary_loss_clip": 0.01554197, + "auxiliary_loss_mlp": 0.01335455, + "balance_loss_clip": 1.20383883, + "balance_loss_mlp": 1.09627366, + "epoch": 0.14020742522170448, + "flos": 14941492500960.0, + "grad_norm": 2.5813164535392676, + "language_loss": 0.72767079, + "learning_rate": 3.873939659120557e-06, + "loss": 0.7565673, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.7220399379730225 + }, + { + "auxiliary_loss_clip": 0.01597263, + "auxiliary_loss_mlp": 0.01280113, + "balance_loss_clip": 1.25672328, + "balance_loss_mlp": 1.08861542, + "epoch": 0.14026754847437245, + "flos": 48829731274080.0, + "grad_norm": 0.8521917712623408, + "language_loss": 0.56046516, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58923894, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 3.1621992588043213 + }, + { + "auxiliary_loss_clip": 0.01543354, + "auxiliary_loss_mlp": 0.01329159, + "balance_loss_clip": 1.19234741, + "balance_loss_mlp": 1.08025014, + "epoch": 0.14032767172704044, + "flos": 25775514110880.0, + "grad_norm": 2.426239259070049, + "language_loss": 0.82813871, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85686386, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 4.3821961879730225 + }, + { + "auxiliary_loss_clip": 0.0153784, + "auxiliary_loss_mlp": 0.01308096, + "balance_loss_clip": 1.18799162, + "balance_loss_mlp": 1.06128466, + "epoch": 0.1403877949797084, + "flos": 21218513931360.0, + "grad_norm": 2.0205343887724734, + "language_loss": 0.81226075, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.84072012, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.9201600551605225 + }, + { + "auxiliary_loss_clip": 0.01540969, + "auxiliary_loss_mlp": 0.0132584, + "balance_loss_clip": 1.19142294, + "balance_loss_mlp": 1.07235336, + "epoch": 0.14044791823237637, + "flos": 22750220046240.0, + "grad_norm": 1.7107878064977091, + "language_loss": 0.82234097, + "learning_rate": 3.873394763046862e-06, + "loss": 0.85100907, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.8147528171539307 + }, + { + "auxiliary_loss_clip": 0.01542246, + "auxiliary_loss_mlp": 0.01345531, + "balance_loss_clip": 1.19291699, + "balance_loss_mlp": 1.09776664, + "epoch": 0.14050804148504434, + "flos": 22966564456800.0, + "grad_norm": 1.9218305681343115, + "language_loss": 0.80679744, + "learning_rate": 3.873258361417225e-06, + "loss": 0.83567524, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.7830734252929688 + }, + { + "auxiliary_loss_clip": 0.01539285, + "auxiliary_loss_mlp": 0.01333817, + "balance_loss_clip": 1.18906796, + "balance_loss_mlp": 1.07651579, + "epoch": 0.1405681647377123, + "flos": 22202531951040.0, + "grad_norm": 2.1277055738279294, + "language_loss": 0.78973031, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81846142, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.775940418243408 + }, + { + "auxiliary_loss_clip": 0.01548924, + "auxiliary_loss_mlp": 0.01340229, + "balance_loss_clip": 1.19895339, + "balance_loss_mlp": 1.09017563, + "epoch": 0.14062828799038027, + "flos": 23735072485440.0, + "grad_norm": 2.2687473272863117, + "language_loss": 0.8020944, + "learning_rate": 3.87298534506069e-06, + "loss": 0.8309859, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.787670612335205 + }, + { + "auxiliary_loss_clip": 0.01535512, + "auxiliary_loss_mlp": 0.01312344, + "balance_loss_clip": 1.18476439, + "balance_loss_mlp": 1.05656838, + "epoch": 0.14068841124304826, + "flos": 39205630355040.0, + "grad_norm": 1.8891837432838183, + "language_loss": 0.65906179, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68754029, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.9294397830963135 + }, + { + "auxiliary_loss_clip": 0.01544984, + "auxiliary_loss_mlp": 0.01318436, + "balance_loss_clip": 1.1950177, + "balance_loss_mlp": 1.07162452, + "epoch": 0.14074853449571623, + "flos": 20194291697760.0, + "grad_norm": 2.8041000340966127, + "language_loss": 0.78961039, + "learning_rate": 3.87271204460899e-06, + "loss": 0.81824458, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 4.344762086868286 + }, + { + "auxiliary_loss_clip": 0.01545356, + "auxiliary_loss_mlp": 0.01336114, + "balance_loss_clip": 1.19501352, + "balance_loss_mlp": 1.09101951, + "epoch": 0.1408086577483842, + "flos": 18407857510080.0, + "grad_norm": 2.337076998069332, + "language_loss": 0.80558556, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.83440024, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 5.784418344497681 + }, + { + "auxiliary_loss_clip": 0.01546936, + "auxiliary_loss_mlp": 0.01335523, + "balance_loss_clip": 1.19474244, + "balance_loss_mlp": 1.0871861, + "epoch": 0.14086878100105216, + "flos": 25266930312960.0, + "grad_norm": 2.2980598423179335, + "language_loss": 0.7764504, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80527502, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.797476053237915 + }, + { + "auxiliary_loss_clip": 0.01596683, + "auxiliary_loss_mlp": 0.01322113, + "balance_loss_clip": 1.25464582, + "balance_loss_mlp": 1.10925293, + "epoch": 0.14092890425372012, + "flos": 65984809272480.0, + "grad_norm": 0.8762213386577509, + "language_loss": 0.61452371, + "learning_rate": 3.872301561343699e-06, + "loss": 0.64371169, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.240057945251465 + }, + { + "auxiliary_loss_clip": 0.01540073, + "auxiliary_loss_mlp": 0.01303797, + "balance_loss_clip": 1.18766546, + "balance_loss_mlp": 1.05717707, + "epoch": 0.1409890275063881, + "flos": 23697181889280.0, + "grad_norm": 3.2789681396783252, + "language_loss": 0.64666247, + "learning_rate": 3.872164591585956e-06, + "loss": 0.67510122, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.7918732166290283 + }, + { + "auxiliary_loss_clip": 0.01534924, + "auxiliary_loss_mlp": 0.01311042, + "balance_loss_clip": 1.18371773, + "balance_loss_mlp": 1.06308639, + "epoch": 0.14104915075905605, + "flos": 23625421081920.0, + "grad_norm": 2.3240185026572866, + "language_loss": 0.74197423, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.7704339, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.758981943130493 + }, + { + "auxiliary_loss_clip": 0.01542146, + "auxiliary_loss_mlp": 0.01339983, + "balance_loss_clip": 1.19102573, + "balance_loss_mlp": 1.09965694, + "epoch": 0.14110927401172405, + "flos": 20597282405280.0, + "grad_norm": 3.8725819055868125, + "language_loss": 0.77674985, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80557108, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.7992634773254395 + }, + { + "auxiliary_loss_clip": 0.01530486, + "auxiliary_loss_mlp": 0.01337728, + "balance_loss_clip": 1.17935717, + "balance_loss_mlp": 1.09415936, + "epoch": 0.141169397264392, + "flos": 28550555625600.0, + "grad_norm": 2.0753974572387106, + "language_loss": 0.77005386, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79873598, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 2.802988052368164 + }, + { + "auxiliary_loss_clip": 0.01538745, + "auxiliary_loss_mlp": 0.01361489, + "balance_loss_clip": 1.18750858, + "balance_loss_mlp": 1.12440562, + "epoch": 0.14122952051705998, + "flos": 17094164644800.0, + "grad_norm": 1.823192179913944, + "language_loss": 0.86679506, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89579749, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.7333080768585205 + }, + { + "auxiliary_loss_clip": 0.01547462, + "auxiliary_loss_mlp": 0.01359882, + "balance_loss_clip": 1.19382191, + "balance_loss_mlp": 1.12031865, + "epoch": 0.14128964376972794, + "flos": 28949184594720.0, + "grad_norm": 1.8231012338679065, + "language_loss": 0.88824755, + "learning_rate": 3.871478678011177e-06, + "loss": 0.91732097, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 2.814709424972534 + }, + { + "auxiliary_loss_clip": 0.01545849, + "auxiliary_loss_mlp": 0.01350384, + "balance_loss_clip": 1.19362855, + "balance_loss_mlp": 1.11024857, + "epoch": 0.1413497670223959, + "flos": 18991805290560.0, + "grad_norm": 1.9908338910691845, + "language_loss": 0.81288159, + "learning_rate": 3.871341282375423e-06, + "loss": 0.8418439, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.76389741897583 + }, + { + "auxiliary_loss_clip": 0.01537771, + "auxiliary_loss_mlp": 0.01315837, + "balance_loss_clip": 1.18591475, + "balance_loss_mlp": 1.07589245, + "epoch": 0.14140989027506387, + "flos": 29864855341440.0, + "grad_norm": 3.083852783738434, + "language_loss": 0.83250928, + "learning_rate": 3.871203815778219e-06, + "loss": 0.86104536, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 2.8224475383758545 + }, + { + "auxiliary_loss_clip": 0.01625582, + "auxiliary_loss_mlp": 0.01306847, + "balance_loss_clip": 1.27905917, + "balance_loss_mlp": 1.11687469, + "epoch": 0.14147001352773186, + "flos": 62086020939360.0, + "grad_norm": 0.9428321361554575, + "language_loss": 0.61925745, + "learning_rate": 3.87106627822478e-06, + "loss": 0.64858174, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 3.2873175144195557 + }, + { + "auxiliary_loss_clip": 0.01543682, + "auxiliary_loss_mlp": 0.01325882, + "balance_loss_clip": 1.19100618, + "balance_loss_mlp": 1.0897522, + "epoch": 0.14153013678039983, + "flos": 22019678470080.0, + "grad_norm": 1.8356873893624146, + "language_loss": 0.87450111, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.90319681, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.763256549835205 + }, + { + "auxiliary_loss_clip": 0.0154025, + "auxiliary_loss_mlp": 0.01308371, + "balance_loss_clip": 1.18633831, + "balance_loss_mlp": 1.06861746, + "epoch": 0.1415902600330678, + "flos": 19722195154080.0, + "grad_norm": 2.9184873930154187, + "language_loss": 0.75041074, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77889693, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 2.7498016357421875 + }, + { + "auxiliary_loss_clip": 0.01639331, + "auxiliary_loss_mlp": 0.01236938, + "balance_loss_clip": 1.29152226, + "balance_loss_mlp": 1.03857422, + "epoch": 0.14165038328573576, + "flos": 65907093744000.0, + "grad_norm": 0.6802128864482584, + "language_loss": 0.51826543, + "learning_rate": 3.870653239879212e-06, + "loss": 0.54702812, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 3.1857781410217285 + }, + { + "auxiliary_loss_clip": 0.01540762, + "auxiliary_loss_mlp": 0.01310192, + "balance_loss_clip": 1.18669844, + "balance_loss_mlp": 1.06872165, + "epoch": 0.14171050653840372, + "flos": 12131784283680.0, + "grad_norm": 2.2648561031129977, + "language_loss": 0.70824498, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.73675454, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 2.737584114074707 + }, + { + "auxiliary_loss_clip": 0.01538194, + "auxiliary_loss_mlp": 0.01316688, + "balance_loss_clip": 1.18293655, + "balance_loss_mlp": 1.06987691, + "epoch": 0.1417706297910717, + "flos": 20414580636960.0, + "grad_norm": 1.8742615598115449, + "language_loss": 0.82152396, + "learning_rate": 3.870377526296674e-06, + "loss": 0.8500728, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 2.719625949859619 + }, + { + "auxiliary_loss_clip": 0.01540496, + "auxiliary_loss_mlp": 0.01315908, + "balance_loss_clip": 1.18715048, + "balance_loss_mlp": 1.07043219, + "epoch": 0.14183075304373965, + "flos": 22382692532640.0, + "grad_norm": 2.21408950584778, + "language_loss": 0.72217768, + "learning_rate": 3.870239563115436e-06, + "loss": 0.75074172, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.760350465774536 + }, + { + "auxiliary_loss_clip": 0.01542144, + "auxiliary_loss_mlp": 0.01312991, + "balance_loss_clip": 1.18942034, + "balance_loss_mlp": 1.0671339, + "epoch": 0.14189087629640765, + "flos": 21583120976640.0, + "grad_norm": 2.2763412274288672, + "language_loss": 0.75428081, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78283215, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 2.7817344665527344 + }, + { + "auxiliary_loss_clip": 0.0154723, + "auxiliary_loss_mlp": 0.01294384, + "balance_loss_clip": 1.19297147, + "balance_loss_mlp": 1.04585695, + "epoch": 0.1419509995490756, + "flos": 20010490012800.0, + "grad_norm": 2.2569950026114936, + "language_loss": 0.81405151, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84246773, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 2.808095932006836 + }, + { + "auxiliary_loss_clip": 0.01548046, + "auxiliary_loss_mlp": 0.01304069, + "balance_loss_clip": 1.1933372, + "balance_loss_mlp": 1.05668628, + "epoch": 0.14201112280174358, + "flos": 31944173695200.0, + "grad_norm": 2.0692862544140955, + "language_loss": 0.74548316, + "learning_rate": 3.86982524807463e-06, + "loss": 0.77400434, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.8681797981262207 + }, + { + "auxiliary_loss_clip": 0.01549579, + "auxiliary_loss_mlp": 0.01306422, + "balance_loss_clip": 1.19556856, + "balance_loss_mlp": 1.05655909, + "epoch": 0.14207124605441154, + "flos": 41467384980000.0, + "grad_norm": 1.8310446909253992, + "language_loss": 0.73917246, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76773244, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 2.9536733627319336 + }, + { + "auxiliary_loss_clip": 0.01546031, + "auxiliary_loss_mlp": 0.01284196, + "balance_loss_clip": 1.19084549, + "balance_loss_mlp": 1.03337944, + "epoch": 0.1421313693070795, + "flos": 31907800225440.0, + "grad_norm": 2.7386873948745607, + "language_loss": 0.73177463, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.76007688, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 2.7930917739868164 + }, + { + "auxiliary_loss_clip": 0.01554366, + "auxiliary_loss_mlp": 0.01298032, + "balance_loss_clip": 1.19912755, + "balance_loss_mlp": 1.04702497, + "epoch": 0.14219149255974747, + "flos": 26873659056960.0, + "grad_norm": 2.333258428900352, + "language_loss": 0.91007519, + "learning_rate": 3.869410294898195e-06, + "loss": 0.93859917, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 2.8276193141937256 + }, + { + "auxiliary_loss_clip": 0.01543066, + "auxiliary_loss_mlp": 0.01290463, + "balance_loss_clip": 1.18728876, + "balance_loss_mlp": 1.04727614, + "epoch": 0.14225161581241544, + "flos": 27456924130560.0, + "grad_norm": 2.1099142643398783, + "language_loss": 0.65324682, + "learning_rate": 3.869271835389268e-06, + "loss": 0.68158209, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 2.7887701988220215 + }, + { + "auxiliary_loss_clip": 0.01542168, + "auxiliary_loss_mlp": 0.01291919, + "balance_loss_clip": 1.1873889, + "balance_loss_mlp": 1.04205668, + "epoch": 0.14231173906508343, + "flos": 10562984064000.0, + "grad_norm": 2.378031639944868, + "language_loss": 0.80837512, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.836716, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 2.776726722717285 + }, + { + "auxiliary_loss_clip": 0.01539342, + "auxiliary_loss_mlp": 0.01311333, + "balance_loss_clip": 1.18314254, + "balance_loss_mlp": 1.05956256, + "epoch": 0.1423718623177514, + "flos": 28363112837280.0, + "grad_norm": 2.2545274941324323, + "language_loss": 0.82864404, + "learning_rate": 3.868994703727742e-06, + "loss": 0.85715079, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.916867256164551 + }, + { + "auxiliary_loss_clip": 0.0155268, + "auxiliary_loss_mlp": 0.01313026, + "balance_loss_clip": 1.19774926, + "balance_loss_mlp": 1.06888556, + "epoch": 0.14243198557041936, + "flos": 19356260623200.0, + "grad_norm": 2.720039497727649, + "language_loss": 0.86923814, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89789522, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.7698376178741455 + }, + { + "auxiliary_loss_clip": 0.01536603, + "auxiliary_loss_mlp": 0.01313506, + "balance_loss_clip": 1.18118119, + "balance_loss_mlp": 1.06803, + "epoch": 0.14249210882308733, + "flos": 28809569580480.0, + "grad_norm": 2.047996251659873, + "language_loss": 0.76051378, + "learning_rate": 3.868717288576354e-06, + "loss": 0.78901488, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.9266672134399414 + }, + { + "auxiliary_loss_clip": 0.01547707, + "auxiliary_loss_mlp": 0.01317452, + "balance_loss_clip": 1.19135618, + "balance_loss_mlp": 1.07827091, + "epoch": 0.1425522320757553, + "flos": 21837166342560.0, + "grad_norm": 1.8197897758699166, + "language_loss": 0.83292282, + "learning_rate": 3.868578474705109e-06, + "loss": 0.86157441, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 2.7562944889068604 + }, + { + "auxiliary_loss_clip": 0.01547047, + "auxiliary_loss_mlp": 0.01305941, + "balance_loss_clip": 1.19121301, + "balance_loss_mlp": 1.06428015, + "epoch": 0.14261235532842326, + "flos": 17313391595520.0, + "grad_norm": 2.7414713006447413, + "language_loss": 0.82943982, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85796964, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 4.289445877075195 + }, + { + "auxiliary_loss_clip": 0.01543342, + "auxiliary_loss_mlp": 0.01315755, + "balance_loss_clip": 1.18780208, + "balance_loss_mlp": 1.07313967, + "epoch": 0.14267247858109125, + "flos": 18808724240640.0, + "grad_norm": 2.586498703799379, + "language_loss": 0.84670413, + "learning_rate": 3.868300634397836e-06, + "loss": 0.87529504, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.811570167541504 + }, + { + "auxiliary_loss_clip": 0.01543432, + "auxiliary_loss_mlp": 0.01326318, + "balance_loss_clip": 1.18829632, + "balance_loss_mlp": 1.08446598, + "epoch": 0.14273260183375922, + "flos": 11360204074080.0, + "grad_norm": 2.8203606138485573, + "language_loss": 0.86031866, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88901615, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.756347179412842 + }, + { + "auxiliary_loss_clip": 0.01548398, + "auxiliary_loss_mlp": 0.01306395, + "balance_loss_clip": 1.19360137, + "balance_loss_mlp": 1.06530607, + "epoch": 0.14279272508642718, + "flos": 27570406278240.0, + "grad_norm": 1.6265468057479109, + "language_loss": 0.79238325, + "learning_rate": 3.868022510705977e-06, + "loss": 0.82093114, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.886323928833008 + }, + { + "auxiliary_loss_clip": 0.01539211, + "auxiliary_loss_mlp": 0.01304698, + "balance_loss_clip": 1.1837486, + "balance_loss_mlp": 1.05655241, + "epoch": 0.14285284833909515, + "flos": 16254388874880.0, + "grad_norm": 3.765939005334882, + "language_loss": 0.76799452, + "learning_rate": 3.867883342604009e-06, + "loss": 0.79643357, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.7168076038360596 + }, + { + "auxiliary_loss_clip": 0.01541489, + "auxiliary_loss_mlp": 0.01320134, + "balance_loss_clip": 1.18610072, + "balance_loss_mlp": 1.07942653, + "epoch": 0.1429129715917631, + "flos": 19757658348000.0, + "grad_norm": 1.851863668307931, + "language_loss": 0.9310751, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95969141, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.7874815464019775 + }, + { + "auxiliary_loss_clip": 0.01543845, + "auxiliary_loss_mlp": 0.01343905, + "balance_loss_clip": 1.18841779, + "balance_loss_mlp": 1.10415149, + "epoch": 0.14297309484443108, + "flos": 21138939923040.0, + "grad_norm": 5.945403431947248, + "language_loss": 0.91633523, + "learning_rate": 3.867604793914382e-06, + "loss": 0.94521272, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.729834794998169 + }, + { + "auxiliary_loss_clip": 0.01541564, + "auxiliary_loss_mlp": 0.01340133, + "balance_loss_clip": 1.18594623, + "balance_loss_mlp": 1.10457492, + "epoch": 0.14303321809709904, + "flos": 23588933827680.0, + "grad_norm": 1.9234577126743493, + "language_loss": 0.73907518, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76789212, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.796705484390259 + }, + { + "auxiliary_loss_clip": 0.01538336, + "auxiliary_loss_mlp": 0.01357926, + "balance_loss_clip": 1.18342817, + "balance_loss_mlp": 1.11512029, + "epoch": 0.14309334134976703, + "flos": 15890123183040.0, + "grad_norm": 3.2314353843952426, + "language_loss": 0.78854495, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81750757, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 5.8352086544036865 + }, + { + "auxiliary_loss_clip": 0.01546924, + "auxiliary_loss_mlp": 0.01312371, + "balance_loss_clip": 1.1910851, + "balance_loss_mlp": 1.06937528, + "epoch": 0.143153464602435, + "flos": 16327439239680.0, + "grad_norm": 2.1735329275818938, + "language_loss": 0.8818233, + "learning_rate": 3.867186439744955e-06, + "loss": 0.91041636, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.7651681900024414 + }, + { + "auxiliary_loss_clip": 0.01534211, + "auxiliary_loss_mlp": 0.0133538, + "balance_loss_clip": 1.1786809, + "balance_loss_mlp": 1.09295619, + "epoch": 0.14321358785510296, + "flos": 17093975004000.0, + "grad_norm": 4.528446639629184, + "language_loss": 0.76729429, + "learning_rate": 3.867046846740299e-06, + "loss": 0.79599023, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.6964120864868164 + }, + { + "auxiliary_loss_clip": 0.01534113, + "auxiliary_loss_mlp": 0.01323476, + "balance_loss_clip": 1.17877591, + "balance_loss_mlp": 1.08124316, + "epoch": 0.14327371110777093, + "flos": 26325326183040.0, + "grad_norm": 2.2613237218276514, + "language_loss": 0.77030528, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79888117, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.834416627883911 + }, + { + "auxiliary_loss_clip": 0.01546116, + "auxiliary_loss_mlp": 0.01306849, + "balance_loss_clip": 1.19014466, + "balance_loss_mlp": 1.06003833, + "epoch": 0.1433338343604389, + "flos": 18078372305280.0, + "grad_norm": 2.2413996833418763, + "language_loss": 0.88428044, + "learning_rate": 3.866767448340471e-06, + "loss": 0.91281009, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.7600677013397217 + }, + { + "auxiliary_loss_clip": 0.01539957, + "auxiliary_loss_mlp": 0.01334141, + "balance_loss_clip": 1.18323183, + "balance_loss_mlp": 1.09724867, + "epoch": 0.14339395761310686, + "flos": 15524567933760.0, + "grad_norm": 7.150183688625658, + "language_loss": 0.7993626, + "learning_rate": 3.866627642955895e-06, + "loss": 0.82810354, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.7935056686401367 + }, + { + "auxiliary_loss_clip": 0.01534528, + "auxiliary_loss_mlp": 0.01331571, + "balance_loss_clip": 1.17961812, + "balance_loss_mlp": 1.09448755, + "epoch": 0.14345408086577485, + "flos": 28551314188800.0, + "grad_norm": 1.817609363559033, + "language_loss": 0.7532959, + "learning_rate": 3.866487766788612e-06, + "loss": 0.78195691, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.8030261993408203 + }, + { + "auxiliary_loss_clip": 0.01540439, + "auxiliary_loss_mlp": 0.01326683, + "balance_loss_clip": 1.18460202, + "balance_loss_mlp": 1.08616662, + "epoch": 0.14351420411844282, + "flos": 20232144365760.0, + "grad_norm": 2.097626074992349, + "language_loss": 0.78689647, + "learning_rate": 3.866347819843925e-06, + "loss": 0.81556773, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 2.782648801803589 + }, + { + "auxiliary_loss_clip": 0.01534025, + "auxiliary_loss_mlp": 0.01308088, + "balance_loss_clip": 1.17854333, + "balance_loss_mlp": 1.06070518, + "epoch": 0.14357432737111078, + "flos": 19867082182560.0, + "grad_norm": 2.219379587364883, + "language_loss": 0.82755125, + "learning_rate": 3.866207802127143e-06, + "loss": 0.85597235, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 2.8453991413116455 + }, + { + "auxiliary_loss_clip": 0.01542595, + "auxiliary_loss_mlp": 0.0130194, + "balance_loss_clip": 1.18643141, + "balance_loss_mlp": 1.05932522, + "epoch": 0.14363445062377875, + "flos": 28259340298560.0, + "grad_norm": 2.3557444221883723, + "language_loss": 0.82386482, + "learning_rate": 3.866067713643573e-06, + "loss": 0.85231018, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 2.885481595993042 + }, + { + "auxiliary_loss_clip": 0.01544917, + "auxiliary_loss_mlp": 0.01332159, + "balance_loss_clip": 1.18937659, + "balance_loss_mlp": 1.09450352, + "epoch": 0.1436945738764467, + "flos": 18188630559360.0, + "grad_norm": 2.290738298645964, + "language_loss": 0.8309176, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.8596884, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.77854323387146 + }, + { + "auxiliary_loss_clip": 0.01544105, + "auxiliary_loss_mlp": 0.01297837, + "balance_loss_clip": 1.18816411, + "balance_loss_mlp": 1.05579424, + "epoch": 0.14375469712911468, + "flos": 27310368263040.0, + "grad_norm": 3.46306129320648, + "language_loss": 0.75157535, + "learning_rate": 3.865787324397324e-06, + "loss": 0.77999473, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 2.776519536972046 + }, + { + "auxiliary_loss_clip": 0.01639697, + "auxiliary_loss_mlp": 0.01252899, + "balance_loss_clip": 1.2955265, + "balance_loss_mlp": 1.04385376, + "epoch": 0.14381482038178264, + "flos": 56897245205280.0, + "grad_norm": 0.8971852577355963, + "language_loss": 0.61847436, + "learning_rate": 3.865647023645277e-06, + "loss": 0.64740038, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 3.163681983947754 + }, + { + "auxiliary_loss_clip": 0.01550791, + "auxiliary_loss_mlp": 0.01352958, + "balance_loss_clip": 1.1950407, + "balance_loss_mlp": 1.11511135, + "epoch": 0.14387494363445064, + "flos": 14283887505120.0, + "grad_norm": 4.382452145758957, + "language_loss": 0.77019554, + "learning_rate": 3.865506652147709e-06, + "loss": 0.79923302, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.747929573059082 + }, + { + "auxiliary_loss_clip": 0.01546552, + "auxiliary_loss_mlp": 0.01341097, + "balance_loss_clip": 1.19192696, + "balance_loss_mlp": 1.09943628, + "epoch": 0.1439350668871186, + "flos": 26763969725280.0, + "grad_norm": 2.1543240973904907, + "language_loss": 0.76813626, + "learning_rate": 3.865366209909941e-06, + "loss": 0.79701269, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 2.814692735671997 + }, + { + "auxiliary_loss_clip": 0.01547698, + "auxiliary_loss_mlp": 0.01358368, + "balance_loss_clip": 1.19406629, + "balance_loss_mlp": 1.12204766, + "epoch": 0.14399519013978657, + "flos": 40703352474240.0, + "grad_norm": 4.451836883688393, + "language_loss": 0.85984021, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88890088, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 2.9246230125427246 + }, + { + "auxiliary_loss_clip": 0.01550273, + "auxiliary_loss_mlp": 0.0136041, + "balance_loss_clip": 1.19668949, + "balance_loss_mlp": 1.12637782, + "epoch": 0.14405531339245453, + "flos": 20559581449920.0, + "grad_norm": 1.7423066012839379, + "language_loss": 0.83194828, + "learning_rate": 3.865085113235113e-06, + "loss": 0.86105514, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 2.840630531311035 + }, + { + "auxiliary_loss_clip": 0.01543403, + "auxiliary_loss_mlp": 0.01348457, + "balance_loss_clip": 1.19033575, + "balance_loss_mlp": 1.11232758, + "epoch": 0.1441154366451225, + "flos": 19574956579680.0, + "grad_norm": 2.430318001869581, + "language_loss": 0.83283246, + "learning_rate": 3.864944458808712e-06, + "loss": 0.86175108, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 2.7405567169189453 + }, + { + "auxiliary_loss_clip": 0.01541244, + "auxiliary_loss_mlp": 0.0130998, + "balance_loss_clip": 1.18872046, + "balance_loss_mlp": 1.06965375, + "epoch": 0.14417555989779046, + "flos": 18517736482560.0, + "grad_norm": 2.0558490730377486, + "language_loss": 0.80261743, + "learning_rate": 3.86480373366343e-06, + "loss": 0.83112967, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.7037513256073 + }, + { + "auxiliary_loss_clip": 0.01554249, + "auxiliary_loss_mlp": 0.01333813, + "balance_loss_clip": 1.20230603, + "balance_loss_mlp": 1.09920907, + "epoch": 0.14423568315045843, + "flos": 26034110856000.0, + "grad_norm": 2.3865267764711087, + "language_loss": 0.6523267, + "learning_rate": 3.864662937804603e-06, + "loss": 0.6812073, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.8036575317382812 + }, + { + "auxiliary_loss_clip": 0.01552935, + "auxiliary_loss_mlp": 0.01352328, + "balance_loss_clip": 1.20103455, + "balance_loss_mlp": 1.11886871, + "epoch": 0.14429580640312642, + "flos": 21290919517440.0, + "grad_norm": 1.9861712804345435, + "language_loss": 0.82122421, + "learning_rate": 3.864522071237571e-06, + "loss": 0.85027689, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.7962663173675537 + }, + { + "auxiliary_loss_clip": 0.01552678, + "auxiliary_loss_mlp": 0.01324526, + "balance_loss_clip": 1.20051217, + "balance_loss_mlp": 1.08839643, + "epoch": 0.14435592965579438, + "flos": 25630134016320.0, + "grad_norm": 1.7095905902416526, + "language_loss": 0.7484076, + "learning_rate": 3.864381133967676e-06, + "loss": 0.77717966, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 2.8297533988952637 + }, + { + "auxiliary_loss_clip": 0.015583, + "auxiliary_loss_mlp": 0.01330589, + "balance_loss_clip": 1.20616436, + "balance_loss_mlp": 1.09350586, + "epoch": 0.14441605290846235, + "flos": 22967209235520.0, + "grad_norm": 2.260915794617251, + "language_loss": 0.81053412, + "learning_rate": 3.86424012600026e-06, + "loss": 0.83942294, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 2.764753818511963 + }, + { + "auxiliary_loss_clip": 0.01551753, + "auxiliary_loss_mlp": 0.01325872, + "balance_loss_clip": 1.19945037, + "balance_loss_mlp": 1.08077741, + "epoch": 0.14447617616113032, + "flos": 17349575424480.0, + "grad_norm": 2.3528142563209262, + "language_loss": 0.84410197, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87287819, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 2.797260284423828 + }, + { + "auxiliary_loss_clip": 0.01540852, + "auxiliary_loss_mlp": 0.01315498, + "balance_loss_clip": 1.18901861, + "balance_loss_mlp": 1.06811464, + "epoch": 0.14453629941379828, + "flos": 24062433713280.0, + "grad_norm": 1.753374724222132, + "language_loss": 0.70214105, + "learning_rate": 3.863957897994262e-06, + "loss": 0.73070455, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.8346283435821533 + }, + { + "auxiliary_loss_clip": 0.01546997, + "auxiliary_loss_mlp": 0.01302759, + "balance_loss_clip": 1.19574809, + "balance_loss_mlp": 1.05766439, + "epoch": 0.14459642266646625, + "flos": 14431353648480.0, + "grad_norm": 2.3893604012201903, + "language_loss": 0.72913861, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75763619, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.7471256256103516 + }, + { + "auxiliary_loss_clip": 0.01546236, + "auxiliary_loss_mlp": 0.01312509, + "balance_loss_clip": 1.19562292, + "balance_loss_mlp": 1.07370877, + "epoch": 0.14465654591913424, + "flos": 9868474604160.0, + "grad_norm": 2.602189051589773, + "language_loss": 0.73411965, + "learning_rate": 3.863675387262386e-06, + "loss": 0.76270711, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 2.8113019466400146 + }, + { + "auxiliary_loss_clip": 0.01557907, + "auxiliary_loss_mlp": 0.01344539, + "balance_loss_clip": 1.20824373, + "balance_loss_mlp": 1.10364127, + "epoch": 0.1447166691718022, + "flos": 24975259848000.0, + "grad_norm": 2.5382644887869286, + "language_loss": 0.75412095, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.78314543, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.7915902137756348 + }, + { + "auxiliary_loss_clip": 0.01548865, + "auxiliary_loss_mlp": 0.01350406, + "balance_loss_clip": 1.19819212, + "balance_loss_mlp": 1.11236894, + "epoch": 0.14477679242447017, + "flos": 21910178779200.0, + "grad_norm": 1.5886415424083473, + "language_loss": 0.79732174, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.82631445, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.7757649421691895 + }, + { + "auxiliary_loss_clip": 0.01555828, + "auxiliary_loss_mlp": 0.01338777, + "balance_loss_clip": 1.20588219, + "balance_loss_mlp": 1.09158492, + "epoch": 0.14483691567713813, + "flos": 20742852140640.0, + "grad_norm": 2.058644113537405, + "language_loss": 0.82358909, + "learning_rate": 3.863251091147299e-06, + "loss": 0.85253513, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.7829349040985107 + }, + { + "auxiliary_loss_clip": 0.0156078, + "auxiliary_loss_mlp": 0.01302524, + "balance_loss_clip": 1.21053064, + "balance_loss_mlp": 1.05552197, + "epoch": 0.1448970389298061, + "flos": 35410614560640.0, + "grad_norm": 2.881351410342882, + "language_loss": 0.74493593, + "learning_rate": 3.863109517792446e-06, + "loss": 0.77356899, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.862628936767578 + }, + { + "auxiliary_loss_clip": 0.0155739, + "auxiliary_loss_mlp": 0.01322217, + "balance_loss_clip": 1.20763159, + "balance_loss_mlp": 1.07635951, + "epoch": 0.14495716218247406, + "flos": 15416206087680.0, + "grad_norm": 1.9092491924578088, + "language_loss": 0.82121921, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.85001522, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 4.280704736709595 + }, + { + "auxiliary_loss_clip": 0.01556089, + "auxiliary_loss_mlp": 0.01303372, + "balance_loss_clip": 1.20750022, + "balance_loss_mlp": 1.05656111, + "epoch": 0.14501728543514203, + "flos": 33696054964800.0, + "grad_norm": 2.6619209210643464, + "language_loss": 0.70642555, + "learning_rate": 3.862826159140214e-06, + "loss": 0.73502016, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.891160249710083 + }, + { + "auxiliary_loss_clip": 0.01559652, + "auxiliary_loss_mlp": 0.01308809, + "balance_loss_clip": 1.21117258, + "balance_loss_mlp": 1.06867421, + "epoch": 0.14507740868781002, + "flos": 15597921723840.0, + "grad_norm": 2.0272336669189794, + "language_loss": 0.7688005, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79748511, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.8213210105895996 + }, + { + "auxiliary_loss_clip": 0.0166379, + "auxiliary_loss_mlp": 0.01223495, + "balance_loss_clip": 1.32871819, + "balance_loss_mlp": 1.0266571, + "epoch": 0.145137531940478, + "flos": 66682011631680.0, + "grad_norm": 0.8687003156755744, + "language_loss": 0.58831215, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.617185, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 3.299733877182007 + }, + { + "auxiliary_loss_clip": 0.01664037, + "auxiliary_loss_mlp": 0.0123632, + "balance_loss_clip": 1.3289721, + "balance_loss_mlp": 1.03871918, + "epoch": 0.14519765519314595, + "flos": 67528690326720.0, + "grad_norm": 0.8378536579028053, + "language_loss": 0.62108946, + "learning_rate": 3.862400591386154e-06, + "loss": 0.65009308, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.2703285217285156 + }, + { + "auxiliary_loss_clip": 0.01556919, + "auxiliary_loss_mlp": 0.01322684, + "balance_loss_clip": 1.20894456, + "balance_loss_mlp": 1.07511044, + "epoch": 0.14525777844581392, + "flos": 17200668011040.0, + "grad_norm": 2.2859397002965265, + "language_loss": 0.72956902, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.75836504, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.7374422550201416 + }, + { + "auxiliary_loss_clip": 0.01657992, + "auxiliary_loss_mlp": 0.01208267, + "balance_loss_clip": 1.3232944, + "balance_loss_mlp": 1.00990295, + "epoch": 0.14531790169848188, + "flos": 65411557233120.0, + "grad_norm": 0.7183065833509399, + "language_loss": 0.60369742, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.63236004, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.276451826095581 + }, + { + "auxiliary_loss_clip": 0.01558749, + "auxiliary_loss_mlp": 0.01334798, + "balance_loss_clip": 1.20910597, + "balance_loss_mlp": 1.08627081, + "epoch": 0.14537802495114985, + "flos": 32565253508640.0, + "grad_norm": 3.230406229924125, + "language_loss": 0.79408944, + "learning_rate": 3.861974388030356e-06, + "loss": 0.82302487, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 7.3618950843811035 + }, + { + "auxiliary_loss_clip": 0.01567807, + "auxiliary_loss_mlp": 0.01301957, + "balance_loss_clip": 1.21807659, + "balance_loss_mlp": 1.056481, + "epoch": 0.1454381482038178, + "flos": 20228579118720.0, + "grad_norm": 1.7383291817367055, + "language_loss": 0.71552849, + "learning_rate": 3.861832179025394e-06, + "loss": 0.7442261, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.7618062496185303 + }, + { + "auxiliary_loss_clip": 0.01562798, + "auxiliary_loss_mlp": 0.01295778, + "balance_loss_clip": 1.21414387, + "balance_loss_mlp": 1.0483948, + "epoch": 0.1454982714564858, + "flos": 22895334643680.0, + "grad_norm": 4.158268939063963, + "language_loss": 0.90156448, + "learning_rate": 3.861689899419569e-06, + "loss": 0.93015021, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.816071033477783 + }, + { + "auxiliary_loss_clip": 0.01557398, + "auxiliary_loss_mlp": 0.01309703, + "balance_loss_clip": 1.20836365, + "balance_loss_mlp": 1.06155682, + "epoch": 0.14555839470915377, + "flos": 20231916796800.0, + "grad_norm": 1.931236388734051, + "language_loss": 0.83564866, + "learning_rate": 3.861547549218276e-06, + "loss": 0.86431968, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.76590633392334 + }, + { + "auxiliary_loss_clip": 0.01561088, + "auxiliary_loss_mlp": 0.01335122, + "balance_loss_clip": 1.21199691, + "balance_loss_mlp": 1.09670353, + "epoch": 0.14561851796182174, + "flos": 22238450282880.0, + "grad_norm": 1.6348966658390958, + "language_loss": 0.81715947, + "learning_rate": 3.861405128426914e-06, + "loss": 0.84612155, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.7775228023529053 + }, + { + "auxiliary_loss_clip": 0.01662101, + "auxiliary_loss_mlp": 0.01228264, + "balance_loss_clip": 1.33122587, + "balance_loss_mlp": 1.02455902, + "epoch": 0.1456786412144897, + "flos": 52643142226080.0, + "grad_norm": 0.9123629803593412, + "language_loss": 0.63290393, + "learning_rate": 3.861262637050883e-06, + "loss": 0.66180754, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.3331470489501953 + }, + { + "auxiliary_loss_clip": 0.01561682, + "auxiliary_loss_mlp": 0.01326343, + "balance_loss_clip": 1.21259451, + "balance_loss_mlp": 1.08201194, + "epoch": 0.14573876446715767, + "flos": 23223909572640.0, + "grad_norm": 2.119906161013361, + "language_loss": 0.82485056, + "learning_rate": 3.861120075095585e-06, + "loss": 0.8537308, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.803412437438965 + }, + { + "auxiliary_loss_clip": 0.01559537, + "auxiliary_loss_mlp": 0.01362618, + "balance_loss_clip": 1.21004891, + "balance_loss_mlp": 1.11733329, + "epoch": 0.14579888771982563, + "flos": 18116376685920.0, + "grad_norm": 2.4075904942745407, + "language_loss": 0.78712279, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81634432, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.7184553146362305 + }, + { + "auxiliary_loss_clip": 0.01562477, + "auxiliary_loss_mlp": 0.01374509, + "balance_loss_clip": 1.2136693, + "balance_loss_mlp": 1.12769783, + "epoch": 0.14585901097249362, + "flos": 23003241351840.0, + "grad_norm": 2.8105603384611992, + "language_loss": 0.83585835, + "learning_rate": 3.860834739468821e-06, + "loss": 0.86522824, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 2.7205495834350586 + }, + { + "auxiliary_loss_clip": 0.01571213, + "auxiliary_loss_mlp": 0.01388814, + "balance_loss_clip": 1.22263432, + "balance_loss_mlp": 1.14944124, + "epoch": 0.1459191342251616, + "flos": 21910861486080.0, + "grad_norm": 2.407306200583126, + "language_loss": 0.8749845, + "learning_rate": 3.860691965808173e-06, + "loss": 0.90458477, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 2.696634531021118 + }, + { + "auxiliary_loss_clip": 0.0155557, + "auxiliary_loss_mlp": 0.01368535, + "balance_loss_clip": 1.20685542, + "balance_loss_mlp": 1.12401295, + "epoch": 0.14597925747782955, + "flos": 14977259120160.0, + "grad_norm": 1.9233042106682574, + "language_loss": 0.6743409, + "learning_rate": 3.8605491215899e-06, + "loss": 0.70358193, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.617429256439209 + }, + { + "auxiliary_loss_clip": 0.01555389, + "auxiliary_loss_mlp": 0.01336186, + "balance_loss_clip": 1.20703793, + "balance_loss_mlp": 1.09662282, + "epoch": 0.14603938073049752, + "flos": 21071009859840.0, + "grad_norm": 1.9724371605910775, + "language_loss": 0.83753514, + "learning_rate": 3.860406206819417e-06, + "loss": 0.86645091, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 2.6657934188842773 + }, + { + "auxiliary_loss_clip": 0.01554955, + "auxiliary_loss_mlp": 0.01327637, + "balance_loss_clip": 1.20557451, + "balance_loss_mlp": 1.09589386, + "epoch": 0.14609950398316549, + "flos": 19866816685440.0, + "grad_norm": 1.8910894951038402, + "language_loss": 0.79051995, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81934583, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 2.687530755996704 + }, + { + "auxiliary_loss_clip": 0.01567889, + "auxiliary_loss_mlp": 0.01389094, + "balance_loss_clip": 1.21889162, + "balance_loss_mlp": 1.15963995, + "epoch": 0.14615962723583345, + "flos": 22421038266720.0, + "grad_norm": 2.129977308969517, + "language_loss": 0.8301214, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85969126, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 2.7432661056518555 + }, + { + "auxiliary_loss_clip": 0.01569172, + "auxiliary_loss_mlp": 0.01388701, + "balance_loss_clip": 1.22115839, + "balance_loss_mlp": 1.15867519, + "epoch": 0.14621975048850142, + "flos": 22348443039840.0, + "grad_norm": 3.8097402987776485, + "language_loss": 0.79003674, + "learning_rate": 3.859977039248921e-06, + "loss": 0.81961548, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 2.719949245452881 + }, + { + "auxiliary_loss_clip": 0.01563926, + "auxiliary_loss_mlp": 0.01370471, + "balance_loss_clip": 1.2163291, + "balance_loss_mlp": 1.1360575, + "epoch": 0.1462798737411694, + "flos": 24391463780160.0, + "grad_norm": 2.279766783527353, + "language_loss": 0.80244195, + "learning_rate": 3.859833842323822e-06, + "loss": 0.83178598, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 2.7817740440368652 + }, + { + "auxiliary_loss_clip": 0.01558961, + "auxiliary_loss_mlp": 0.01369631, + "balance_loss_clip": 1.21133244, + "balance_loss_mlp": 1.13960505, + "epoch": 0.14633999699383737, + "flos": 19246685076000.0, + "grad_norm": 2.7091977210944775, + "language_loss": 0.78181201, + "learning_rate": 3.859690574873638e-06, + "loss": 0.81109792, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 2.728252649307251 + }, + { + "auxiliary_loss_clip": 0.0164748, + "auxiliary_loss_mlp": 0.0124176, + "balance_loss_clip": 1.31762838, + "balance_loss_mlp": 1.04873657, + "epoch": 0.14640012024650534, + "flos": 62667010323360.0, + "grad_norm": 0.8634506731420342, + "language_loss": 0.58420169, + "learning_rate": 3.8595472369038e-06, + "loss": 0.61309409, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 3.3128116130828857 + }, + { + "auxiliary_loss_clip": 0.01555726, + "auxiliary_loss_mlp": 0.01322731, + "balance_loss_clip": 1.20828485, + "balance_loss_mlp": 1.08087969, + "epoch": 0.1464602434991733, + "flos": 12277885013280.0, + "grad_norm": 2.549866877829674, + "language_loss": 0.8865788, + "learning_rate": 3.859403828419744e-06, + "loss": 0.91536337, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 2.7944581508636475 + }, + { + "auxiliary_loss_clip": 0.01551065, + "auxiliary_loss_mlp": 0.0137752, + "balance_loss_clip": 1.20265412, + "balance_loss_mlp": 1.13109016, + "epoch": 0.14652036675184127, + "flos": 20924378136000.0, + "grad_norm": 2.489082368127323, + "language_loss": 0.74949843, + "learning_rate": 3.85926034942691e-06, + "loss": 0.77878428, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 2.8161990642547607 + }, + { + "auxiliary_loss_clip": 0.0155434, + "auxiliary_loss_mlp": 0.01417007, + "balance_loss_clip": 1.20549619, + "balance_loss_mlp": 1.16638112, + "epoch": 0.14658049000450923, + "flos": 27705849194880.0, + "grad_norm": 2.340933603620872, + "language_loss": 0.73865819, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76837158, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 2.7662129402160645 + }, + { + "auxiliary_loss_clip": 0.01559394, + "auxiliary_loss_mlp": 0.01450425, + "balance_loss_clip": 1.21065462, + "balance_loss_mlp": 1.20780993, + "epoch": 0.14664061325717723, + "flos": 24938848450080.0, + "grad_norm": 1.864033491352056, + "language_loss": 0.746364, + "learning_rate": 3.858973179936668e-06, + "loss": 0.7764622, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 2.8254892826080322 + }, + { + "auxiliary_loss_clip": 0.01550518, + "auxiliary_loss_mlp": 0.01398154, + "balance_loss_clip": 1.20146406, + "balance_loss_mlp": 1.15038967, + "epoch": 0.1467007365098452, + "flos": 40300892760960.0, + "grad_norm": 2.323889734351152, + "language_loss": 0.74980521, + "learning_rate": 3.85882948945015e-06, + "loss": 0.77929193, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 2.8921737670898438 + }, + { + "auxiliary_loss_clip": 0.01558372, + "auxiliary_loss_mlp": 0.0136802, + "balance_loss_clip": 1.20986211, + "balance_loss_mlp": 1.12750316, + "epoch": 0.14676085976251316, + "flos": 26543453217120.0, + "grad_norm": 2.279965647561627, + "language_loss": 0.8334136, + "learning_rate": 3.85868572847663e-06, + "loss": 0.86267757, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 2.802849054336548 + }, + { + "auxiliary_loss_clip": 0.01549106, + "auxiliary_loss_mlp": 0.01308186, + "balance_loss_clip": 1.20016384, + "balance_loss_mlp": 1.05641639, + "epoch": 0.14682098301518112, + "flos": 23552370717120.0, + "grad_norm": 2.191821215604706, + "language_loss": 0.7259959, + "learning_rate": 3.858541897021563e-06, + "loss": 0.75456882, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.803795576095581 + }, + { + "auxiliary_loss_clip": 0.01562406, + "auxiliary_loss_mlp": 0.01357295, + "balance_loss_clip": 1.21267748, + "balance_loss_mlp": 1.11944818, + "epoch": 0.1468811062678491, + "flos": 11652405533280.0, + "grad_norm": 6.239043873643049, + "language_loss": 0.81318867, + "learning_rate": 3.8583979950904e-06, + "loss": 0.84238565, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 2.7373147010803223 + }, + { + "auxiliary_loss_clip": 0.01556841, + "auxiliary_loss_mlp": 0.01381922, + "balance_loss_clip": 1.2081244, + "balance_loss_mlp": 1.13835382, + "epoch": 0.14694122952051705, + "flos": 23004834334560.0, + "grad_norm": 2.181063618366127, + "language_loss": 0.83120322, + "learning_rate": 3.858254022688599e-06, + "loss": 0.86059093, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.8243443965911865 + }, + { + "auxiliary_loss_clip": 0.01550026, + "auxiliary_loss_mlp": 0.01427482, + "balance_loss_clip": 1.20057535, + "balance_loss_mlp": 1.1873467, + "epoch": 0.14700135277318502, + "flos": 26505259195680.0, + "grad_norm": 3.5154137713624487, + "language_loss": 0.71231675, + "learning_rate": 3.85810997982162e-06, + "loss": 0.74209177, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.781724452972412 + }, + { + "auxiliary_loss_clip": 0.01631808, + "auxiliary_loss_mlp": 0.01362877, + "balance_loss_clip": 1.29600132, + "balance_loss_mlp": 1.17137909, + "epoch": 0.147061476025853, + "flos": 59455600956000.0, + "grad_norm": 0.838592142584177, + "language_loss": 0.63036168, + "learning_rate": 3.857965866494923e-06, + "loss": 0.66030848, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 3.2552003860473633 + }, + { + "auxiliary_loss_clip": 0.01556737, + "auxiliary_loss_mlp": 0.0141416, + "balance_loss_clip": 1.20657647, + "balance_loss_mlp": 1.17078209, + "epoch": 0.14712159927852098, + "flos": 28333490580000.0, + "grad_norm": 1.6512207039416935, + "language_loss": 0.75252533, + "learning_rate": 3.857821682713975e-06, + "loss": 0.78223425, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.8342769145965576 + }, + { + "auxiliary_loss_clip": 0.01543266, + "auxiliary_loss_mlp": 0.01376031, + "balance_loss_clip": 1.19293392, + "balance_loss_mlp": 1.13532329, + "epoch": 0.14718172253118894, + "flos": 27092317085280.0, + "grad_norm": 2.8674504789366058, + "language_loss": 0.85272384, + "learning_rate": 3.857677428484242e-06, + "loss": 0.88191688, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.7889673709869385 + }, + { + "auxiliary_loss_clip": 0.01631276, + "auxiliary_loss_mlp": 0.01305023, + "balance_loss_clip": 1.2952292, + "balance_loss_mlp": 1.11734009, + "epoch": 0.1472418457838569, + "flos": 66713151015360.0, + "grad_norm": 0.7826127490889095, + "language_loss": 0.5675385, + "learning_rate": 3.857533103811195e-06, + "loss": 0.59690154, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 4.783655643463135 + }, + { + "auxiliary_loss_clip": 0.01557499, + "auxiliary_loss_mlp": 0.013299, + "balance_loss_clip": 1.20717573, + "balance_loss_mlp": 1.08766711, + "epoch": 0.14730196903652487, + "flos": 19575639286560.0, + "grad_norm": 2.249373934796398, + "language_loss": 0.85717118, + "learning_rate": 3.857388708700307e-06, + "loss": 0.88604522, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 2.7332968711853027 + }, + { + "auxiliary_loss_clip": 0.01554834, + "auxiliary_loss_mlp": 0.01393829, + "balance_loss_clip": 1.20556355, + "balance_loss_mlp": 1.14816248, + "epoch": 0.14736209228919284, + "flos": 16072900807680.0, + "grad_norm": 2.9774450114736464, + "language_loss": 0.75252122, + "learning_rate": 3.857244243157052e-06, + "loss": 0.78200781, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.7061960697174072 + }, + { + "auxiliary_loss_clip": 0.01551058, + "auxiliary_loss_mlp": 0.01448195, + "balance_loss_clip": 1.20078921, + "balance_loss_mlp": 1.19528103, + "epoch": 0.1474222155418608, + "flos": 23041700870400.0, + "grad_norm": 1.9298856892516165, + "language_loss": 0.82593501, + "learning_rate": 3.85709970718691e-06, + "loss": 0.85592759, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.768242835998535 + }, + { + "auxiliary_loss_clip": 0.01552539, + "auxiliary_loss_mlp": 0.01454111, + "balance_loss_clip": 1.20202088, + "balance_loss_mlp": 1.20157814, + "epoch": 0.1474823387945288, + "flos": 17020507429440.0, + "grad_norm": 2.052465135602901, + "language_loss": 0.74238706, + "learning_rate": 3.856955100795361e-06, + "loss": 0.77245355, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.8243038654327393 + }, + { + "auxiliary_loss_clip": 0.01551926, + "auxiliary_loss_mlp": 0.01438968, + "balance_loss_clip": 1.20041573, + "balance_loss_mlp": 1.18471861, + "epoch": 0.14754246204719676, + "flos": 17896998022560.0, + "grad_norm": 2.874500360036818, + "language_loss": 0.76132429, + "learning_rate": 3.856810423987889e-06, + "loss": 0.79123318, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 2.701361656188965 + }, + { + "auxiliary_loss_clip": 0.01556101, + "auxiliary_loss_mlp": 0.01379841, + "balance_loss_clip": 1.20548248, + "balance_loss_mlp": 1.12921524, + "epoch": 0.14760258529986472, + "flos": 13080794247360.0, + "grad_norm": 2.484297281676825, + "language_loss": 0.83134979, + "learning_rate": 3.856665676769979e-06, + "loss": 0.86070919, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 4.273358583450317 + }, + { + "auxiliary_loss_clip": 0.01549558, + "auxiliary_loss_mlp": 0.01345687, + "balance_loss_clip": 1.19762921, + "balance_loss_mlp": 1.10383475, + "epoch": 0.1476627085525327, + "flos": 30808820859840.0, + "grad_norm": 2.211051145382505, + "language_loss": 0.842188, + "learning_rate": 3.85652085914712e-06, + "loss": 0.87114042, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 4.325783967971802 + }, + { + "auxiliary_loss_clip": 0.01557029, + "auxiliary_loss_mlp": 0.01349977, + "balance_loss_clip": 1.20493948, + "balance_loss_mlp": 1.11441922, + "epoch": 0.14772283180520066, + "flos": 21691596607200.0, + "grad_norm": 2.162068429816969, + "language_loss": 0.84660625, + "learning_rate": 3.856375971124805e-06, + "loss": 0.87567627, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 4.245769500732422 + }, + { + "auxiliary_loss_clip": 0.01556621, + "auxiliary_loss_mlp": 0.01406093, + "balance_loss_clip": 1.20499277, + "balance_loss_mlp": 1.17644835, + "epoch": 0.14778295505786862, + "flos": 18772388699040.0, + "grad_norm": 2.0506149474320305, + "language_loss": 0.7551254, + "learning_rate": 3.856231012708527e-06, + "loss": 0.78475255, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.74843168258667 + }, + { + "auxiliary_loss_clip": 0.01555656, + "auxiliary_loss_mlp": 0.01419267, + "balance_loss_clip": 1.20415068, + "balance_loss_mlp": 1.18313754, + "epoch": 0.1478430783105366, + "flos": 22895789781600.0, + "grad_norm": 2.335063895956271, + "language_loss": 0.83241427, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86216348, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.762740135192871 + }, + { + "auxiliary_loss_clip": 0.0155331, + "auxiliary_loss_mlp": 0.01424818, + "balance_loss_clip": 1.20122564, + "balance_loss_mlp": 1.19479203, + "epoch": 0.14790320156320458, + "flos": 15087100164480.0, + "grad_norm": 2.226829914306968, + "language_loss": 0.75641704, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78619832, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.75925612449646 + }, + { + "auxiliary_loss_clip": 0.01550679, + "auxiliary_loss_mlp": 0.01426413, + "balance_loss_clip": 1.19812524, + "balance_loss_mlp": 1.19829452, + "epoch": 0.14796332481587254, + "flos": 26507155603680.0, + "grad_norm": 1.7122146614264435, + "language_loss": 0.81647754, + "learning_rate": 3.855795715150896e-06, + "loss": 0.84624851, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.874581813812256 + }, + { + "auxiliary_loss_clip": 0.01561153, + "auxiliary_loss_mlp": 0.0138734, + "balance_loss_clip": 1.20834947, + "balance_loss_mlp": 1.15368962, + "epoch": 0.1480234480685405, + "flos": 17564933702880.0, + "grad_norm": 4.745557949128436, + "language_loss": 0.66271734, + "learning_rate": 3.855650475213761e-06, + "loss": 0.69220221, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.755371570587158 + }, + { + "auxiliary_loss_clip": 0.01560058, + "auxiliary_loss_mlp": 0.0135698, + "balance_loss_clip": 1.20703745, + "balance_loss_mlp": 1.12180424, + "epoch": 0.14808357132120847, + "flos": 53586387473760.0, + "grad_norm": 1.6177074089307084, + "language_loss": 0.673118, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.70228839, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 3.055626153945923 + }, + { + "auxiliary_loss_clip": 0.01555739, + "auxiliary_loss_mlp": 0.01306419, + "balance_loss_clip": 1.20340538, + "balance_loss_mlp": 1.06666505, + "epoch": 0.14814369457387644, + "flos": 19831277635200.0, + "grad_norm": 2.2035970355452825, + "language_loss": 0.76709461, + "learning_rate": 3.855359784245646e-06, + "loss": 0.79571617, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.8593947887420654 + }, + { + "auxiliary_loss_clip": 0.01560604, + "auxiliary_loss_mlp": 0.01344865, + "balance_loss_clip": 1.20798445, + "balance_loss_mlp": 1.10205996, + "epoch": 0.1482038178265444, + "flos": 23917243259520.0, + "grad_norm": 1.9435526160370107, + "language_loss": 0.80237037, + "learning_rate": 3.855214333225688e-06, + "loss": 0.83142507, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 2.8783116340637207 + }, + { + "auxiliary_loss_clip": 0.01554967, + "auxiliary_loss_mlp": 0.0134, + "balance_loss_clip": 1.20221841, + "balance_loss_mlp": 1.0870856, + "epoch": 0.1482639410792124, + "flos": 24172805751840.0, + "grad_norm": 3.0920827984872363, + "language_loss": 0.7667824, + "learning_rate": 3.855068811855817e-06, + "loss": 0.79573202, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 2.846306562423706 + }, + { + "auxiliary_loss_clip": 0.01644562, + "auxiliary_loss_mlp": 0.01230286, + "balance_loss_clip": 1.30320334, + "balance_loss_mlp": 1.03039551, + "epoch": 0.14832406433188036, + "flos": 66197133298080.0, + "grad_norm": 0.785549563759359, + "language_loss": 0.60017157, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62891996, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.3910436630249023 + }, + { + "auxiliary_loss_clip": 0.01558395, + "auxiliary_loss_mlp": 0.01362817, + "balance_loss_clip": 1.20448434, + "balance_loss_mlp": 1.12077427, + "epoch": 0.14838418758454833, + "flos": 25413637893120.0, + "grad_norm": 2.0663657753451177, + "language_loss": 0.88253766, + "learning_rate": 3.85477755808841e-06, + "loss": 0.91174984, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 2.8273603916168213 + }, + { + "auxiliary_loss_clip": 0.01548827, + "auxiliary_loss_mlp": 0.01413289, + "balance_loss_clip": 1.19506359, + "balance_loss_mlp": 1.16647816, + "epoch": 0.1484443108372163, + "flos": 23291877564000.0, + "grad_norm": 2.3511423166786827, + "language_loss": 0.76034027, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78996146, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 2.7848198413848877 + }, + { + "auxiliary_loss_clip": 0.01551952, + "auxiliary_loss_mlp": 0.01450047, + "balance_loss_clip": 1.19887936, + "balance_loss_mlp": 1.21143746, + "epoch": 0.14850443408988426, + "flos": 14649442754400.0, + "grad_norm": 2.612313778921489, + "language_loss": 0.75844669, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78846669, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.7954256534576416 + }, + { + "auxiliary_loss_clip": 0.01555815, + "auxiliary_loss_mlp": 0.01452065, + "balance_loss_clip": 1.20283198, + "balance_loss_mlp": 1.21250212, + "epoch": 0.14856455734255222, + "flos": 23550474309120.0, + "grad_norm": 2.0985785037678197, + "language_loss": 0.72580886, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.75588769, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 2.795914888381958 + }, + { + "auxiliary_loss_clip": 0.01545961, + "auxiliary_loss_mlp": 0.01457661, + "balance_loss_clip": 1.19196773, + "balance_loss_mlp": 1.21657181, + "epoch": 0.1486246805952202, + "flos": 18079244652960.0, + "grad_norm": 2.550613313032312, + "language_loss": 0.90032631, + "learning_rate": 3.854194206597615e-06, + "loss": 0.93036252, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 2.7486538887023926 + }, + { + "auxiliary_loss_clip": 0.01549557, + "auxiliary_loss_mlp": 0.01451214, + "balance_loss_clip": 1.19634414, + "balance_loss_mlp": 1.21508455, + "epoch": 0.14868480384788818, + "flos": 19355615844480.0, + "grad_norm": 2.3394580815474897, + "language_loss": 0.80375034, + "learning_rate": 3.854048192933008e-06, + "loss": 0.83375806, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 2.810352087020874 + }, + { + "auxiliary_loss_clip": 0.01544455, + "auxiliary_loss_mlp": 0.01458825, + "balance_loss_clip": 1.19037175, + "balance_loss_mlp": 1.22059751, + "epoch": 0.14874492710055615, + "flos": 22202380238400.0, + "grad_norm": 5.468956887734675, + "language_loss": 0.77683544, + "learning_rate": 3.853902108962709e-06, + "loss": 0.80686831, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.7959280014038086 + }, + { + "auxiliary_loss_clip": 0.01537346, + "auxiliary_loss_mlp": 0.01433626, + "balance_loss_clip": 1.18275976, + "balance_loss_mlp": 1.19616151, + "epoch": 0.1488050503532241, + "flos": 21105448993440.0, + "grad_norm": 1.8223026247440084, + "language_loss": 0.82440895, + "learning_rate": 3.853755954692255e-06, + "loss": 0.8541187, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 2.7855379581451416 + }, + { + "auxiliary_loss_clip": 0.01552621, + "auxiliary_loss_mlp": 0.01443892, + "balance_loss_clip": 1.19763339, + "balance_loss_mlp": 1.20890677, + "epoch": 0.14886517360589208, + "flos": 12788213506560.0, + "grad_norm": 2.1106494336488044, + "language_loss": 0.80352527, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83349037, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 2.7916336059570312 + }, + { + "auxiliary_loss_clip": 0.01550027, + "auxiliary_loss_mlp": 0.01404383, + "balance_loss_clip": 1.19599867, + "balance_loss_mlp": 1.16806245, + "epoch": 0.14892529685856004, + "flos": 29025420924960.0, + "grad_norm": 5.8650383465355835, + "language_loss": 0.78169841, + "learning_rate": 3.853463435273058e-06, + "loss": 0.81124252, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 2.8096885681152344 + }, + { + "auxiliary_loss_clip": 0.01623208, + "auxiliary_loss_mlp": 0.01280777, + "balance_loss_clip": 1.27855062, + "balance_loss_mlp": 1.09156799, + "epoch": 0.148985420111228, + "flos": 61932182865120.0, + "grad_norm": 0.8225157199125859, + "language_loss": 0.60115075, + "learning_rate": 3.853317070135407e-06, + "loss": 0.63019061, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 3.3807952404022217 + }, + { + "auxiliary_loss_clip": 0.01541844, + "auxiliary_loss_mlp": 0.01331648, + "balance_loss_clip": 1.18726301, + "balance_loss_mlp": 1.08731651, + "epoch": 0.149045543363896, + "flos": 23917394972160.0, + "grad_norm": 2.9952246405560685, + "language_loss": 0.71352661, + "learning_rate": 3.853170634719787e-06, + "loss": 0.74226159, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 2.759598970413208 + }, + { + "auxiliary_loss_clip": 0.0154113, + "auxiliary_loss_mlp": 0.01398283, + "balance_loss_clip": 1.18614256, + "balance_loss_mlp": 1.15261674, + "epoch": 0.14910566661656396, + "flos": 23656370824800.0, + "grad_norm": 2.3345693894717563, + "language_loss": 0.81357825, + "learning_rate": 3.853024129031751e-06, + "loss": 0.8429724, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 2.7541816234588623 + }, + { + "auxiliary_loss_clip": 0.01535863, + "auxiliary_loss_mlp": 0.01426941, + "balance_loss_clip": 1.18167377, + "balance_loss_mlp": 1.17860377, + "epoch": 0.14916578986923193, + "flos": 20517025690080.0, + "grad_norm": 2.104393129771637, + "language_loss": 0.84661764, + "learning_rate": 3.852877553076854e-06, + "loss": 0.87624568, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 2.7655608654022217 + }, + { + "auxiliary_loss_clip": 0.01542828, + "auxiliary_loss_mlp": 0.01443707, + "balance_loss_clip": 1.18892813, + "balance_loss_mlp": 1.19365406, + "epoch": 0.1492259131218999, + "flos": 22494012775200.0, + "grad_norm": 3.701697566282837, + "language_loss": 0.77764475, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.80751014, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.7308337688446045 + }, + { + "auxiliary_loss_clip": 0.01541102, + "auxiliary_loss_mlp": 0.01450779, + "balance_loss_clip": 1.1877501, + "balance_loss_mlp": 1.19691133, + "epoch": 0.14928603637456786, + "flos": 23188142953440.0, + "grad_norm": 2.5995281304495514, + "language_loss": 0.78759366, + "learning_rate": 3.852584190388713e-06, + "loss": 0.81751251, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.745448350906372 + }, + { + "auxiliary_loss_clip": 0.01548819, + "auxiliary_loss_mlp": 0.01414059, + "balance_loss_clip": 1.1954248, + "balance_loss_mlp": 1.16839206, + "epoch": 0.14934615962723582, + "flos": 21655412778240.0, + "grad_norm": 1.5984201738498485, + "language_loss": 0.70696843, + "learning_rate": 3.852437403666595e-06, + "loss": 0.73659718, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.8194425106048584 + }, + { + "auxiliary_loss_clip": 0.01553759, + "auxiliary_loss_mlp": 0.01399907, + "balance_loss_clip": 1.19900441, + "balance_loss_mlp": 1.15862775, + "epoch": 0.1494062828799038, + "flos": 27012174154560.0, + "grad_norm": 1.8356643231708951, + "language_loss": 0.8441931, + "learning_rate": 3.852290546699863e-06, + "loss": 0.87372983, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.8323984146118164 + }, + { + "auxiliary_loss_clip": 0.01545333, + "auxiliary_loss_mlp": 0.01330687, + "balance_loss_clip": 1.19144583, + "balance_loss_mlp": 1.0911243, + "epoch": 0.14946640613257178, + "flos": 21217072661280.0, + "grad_norm": 2.034540233445279, + "language_loss": 0.85241461, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.8811748, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.802687168121338 + }, + { + "auxiliary_loss_clip": 0.01546794, + "auxiliary_loss_mlp": 0.01377954, + "balance_loss_clip": 1.19351053, + "balance_loss_mlp": 1.14296901, + "epoch": 0.14952652938523975, + "flos": 13372578496800.0, + "grad_norm": 2.221034551977866, + "language_loss": 0.74563152, + "learning_rate": 3.851996622054842e-06, + "loss": 0.77487904, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 4.297451019287109 + }, + { + "auxiliary_loss_clip": 0.01542152, + "auxiliary_loss_mlp": 0.01410991, + "balance_loss_clip": 1.18720019, + "balance_loss_mlp": 1.1811552, + "epoch": 0.1495866526379077, + "flos": 35520645245760.0, + "grad_norm": 3.001900150328936, + "language_loss": 0.72343326, + "learning_rate": 3.8518495543877e-06, + "loss": 0.75296474, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.8812339305877686 + }, + { + "auxiliary_loss_clip": 0.01539, + "auxiliary_loss_mlp": 0.01437566, + "balance_loss_clip": 1.1852932, + "balance_loss_mlp": 1.20887506, + "epoch": 0.14964677589057568, + "flos": 17634305036160.0, + "grad_norm": 12.266235838136218, + "language_loss": 0.70786393, + "learning_rate": 3.851702416498235e-06, + "loss": 0.73762959, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 2.736828565597534 + }, + { + "auxiliary_loss_clip": 0.01546595, + "auxiliary_loss_mlp": 0.01459139, + "balance_loss_clip": 1.19237876, + "balance_loss_mlp": 1.22377229, + "epoch": 0.14970689914324364, + "flos": 20186895706560.0, + "grad_norm": 4.6561404681891405, + "language_loss": 0.82056195, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.85061926, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.715425491333008 + }, + { + "auxiliary_loss_clip": 0.01546379, + "auxiliary_loss_mlp": 0.01471986, + "balance_loss_clip": 1.19144785, + "balance_loss_mlp": 1.23986149, + "epoch": 0.1497670223959116, + "flos": 37231525810080.0, + "grad_norm": 1.9790352099384396, + "language_loss": 0.80041254, + "learning_rate": 3.851407930074666e-06, + "loss": 0.83059621, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 2.8664207458496094 + }, + { + "auxiliary_loss_clip": 0.01538307, + "auxiliary_loss_mlp": 0.01467026, + "balance_loss_clip": 1.18343246, + "balance_loss_mlp": 1.23509264, + "epoch": 0.1498271456485796, + "flos": 24457800860640.0, + "grad_norm": 1.8731623183081416, + "language_loss": 0.91304463, + "learning_rate": 3.851260581551727e-06, + "loss": 0.94309795, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.776118278503418 + }, + { + "auxiliary_loss_clip": 0.0155276, + "auxiliary_loss_mlp": 0.01476837, + "balance_loss_clip": 1.19723547, + "balance_loss_mlp": 1.24681115, + "epoch": 0.14988726890124757, + "flos": 16255488791520.0, + "grad_norm": 3.1371735876735207, + "language_loss": 0.79122877, + "learning_rate": 3.851113162828802e-06, + "loss": 0.82152474, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.8247663974761963 + }, + { + "auxiliary_loss_clip": 0.0153871, + "auxiliary_loss_mlp": 0.0145456, + "balance_loss_clip": 1.18352127, + "balance_loss_mlp": 1.22071934, + "epoch": 0.14994739215391553, + "flos": 20668588074720.0, + "grad_norm": 1.784382846473902, + "language_loss": 0.80132103, + "learning_rate": 3.85096567391148e-06, + "loss": 0.83125371, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 4.310056924819946 + }, + { + "auxiliary_loss_clip": 0.0154336, + "auxiliary_loss_mlp": 0.01453249, + "balance_loss_clip": 1.18791008, + "balance_loss_mlp": 1.21959853, + "epoch": 0.1500075154065835, + "flos": 70657226300160.0, + "grad_norm": 2.457972590126404, + "language_loss": 0.66427064, + "learning_rate": 3.850818114805354e-06, + "loss": 0.6942367, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 4.720576047897339 + }, + { + "auxiliary_loss_clip": 0.01630861, + "auxiliary_loss_mlp": 0.01369194, + "balance_loss_clip": 1.28059435, + "balance_loss_mlp": 1.18379974, + "epoch": 0.15006763865925146, + "flos": 68017930763040.0, + "grad_norm": 0.903117615803456, + "language_loss": 0.59493858, + "learning_rate": 3.850670485516019e-06, + "loss": 0.62493914, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.2933425903320312 + }, + { + "auxiliary_loss_clip": 0.01544621, + "auxiliary_loss_mlp": 0.01413962, + "balance_loss_clip": 1.1892345, + "balance_loss_mlp": 1.17401731, + "epoch": 0.15012776191191943, + "flos": 18918261859680.0, + "grad_norm": 1.8616910525129362, + "language_loss": 0.658373, + "learning_rate": 3.850522786049075e-06, + "loss": 0.68795884, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 2.7927777767181396 + }, + { + "auxiliary_loss_clip": 0.01552625, + "auxiliary_loss_mlp": 0.01347106, + "balance_loss_clip": 1.19653702, + "balance_loss_mlp": 1.10906923, + "epoch": 0.1501878851645874, + "flos": 23703895173600.0, + "grad_norm": 1.7109856402614783, + "language_loss": 0.75387901, + "learning_rate": 3.850375016410121e-06, + "loss": 0.78287631, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 2.9177675247192383 + }, + { + "auxiliary_loss_clip": 0.01547571, + "auxiliary_loss_mlp": 0.01340715, + "balance_loss_clip": 1.19189858, + "balance_loss_mlp": 1.08894467, + "epoch": 0.15024800841725539, + "flos": 20414618565120.0, + "grad_norm": 2.4674262902792106, + "language_loss": 0.72463202, + "learning_rate": 3.850227176604761e-06, + "loss": 0.75351489, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.830550193786621 + }, + { + "auxiliary_loss_clip": 0.01543804, + "auxiliary_loss_mlp": 0.0140138, + "balance_loss_clip": 1.18740332, + "balance_loss_mlp": 1.15304351, + "epoch": 0.15030813166992335, + "flos": 31833649944000.0, + "grad_norm": 1.9794054814905493, + "language_loss": 0.72027475, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74972659, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.8689472675323486 + }, + { + "auxiliary_loss_clip": 0.01542669, + "auxiliary_loss_mlp": 0.0144188, + "balance_loss_clip": 1.18688166, + "balance_loss_mlp": 1.19106364, + "epoch": 0.15036825492259132, + "flos": 35660449900800.0, + "grad_norm": 2.0009998702655647, + "language_loss": 0.65172648, + "learning_rate": 3.849931286517249e-06, + "loss": 0.68157202, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.8730599880218506 + }, + { + "auxiliary_loss_clip": 0.01543404, + "auxiliary_loss_mlp": 0.01448054, + "balance_loss_clip": 1.18817973, + "balance_loss_mlp": 1.19552147, + "epoch": 0.15042837817525928, + "flos": 18839863624320.0, + "grad_norm": 2.541471030520374, + "language_loss": 0.83717585, + "learning_rate": 3.849783236246318e-06, + "loss": 0.86709046, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.798248767852783 + }, + { + "auxiliary_loss_clip": 0.01537767, + "auxiliary_loss_mlp": 0.0143985, + "balance_loss_clip": 1.18203092, + "balance_loss_mlp": 1.18579149, + "epoch": 0.15048850142792725, + "flos": 19537521121440.0, + "grad_norm": 2.755368612583462, + "language_loss": 0.77915508, + "learning_rate": 3.849635115831421e-06, + "loss": 0.80893123, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.799794912338257 + }, + { + "auxiliary_loss_clip": 0.01544212, + "auxiliary_loss_mlp": 0.0140787, + "balance_loss_clip": 1.18833065, + "balance_loss_mlp": 1.1560998, + "epoch": 0.1505486246805952, + "flos": 22019716398240.0, + "grad_norm": 2.162441654164587, + "language_loss": 0.85425878, + "learning_rate": 3.849486925278176e-06, + "loss": 0.88377959, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 2.6969473361968994 + }, + { + "auxiliary_loss_clip": 0.01546038, + "auxiliary_loss_mlp": 0.01372846, + "balance_loss_clip": 1.18978977, + "balance_loss_mlp": 1.12946844, + "epoch": 0.15060874793326318, + "flos": 20745393327360.0, + "grad_norm": 1.6992617958689937, + "language_loss": 0.83363545, + "learning_rate": 3.8493386645922e-06, + "loss": 0.86282426, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 2.8180313110351562 + }, + { + "auxiliary_loss_clip": 0.01533181, + "auxiliary_loss_mlp": 0.01314923, + "balance_loss_clip": 1.1771481, + "balance_loss_mlp": 1.07383406, + "epoch": 0.15066887118593117, + "flos": 16473767538240.0, + "grad_norm": 1.8883072814952095, + "language_loss": 0.76371622, + "learning_rate": 3.849190333779117e-06, + "loss": 0.79219723, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.7722575664520264 + }, + { + "auxiliary_loss_clip": 0.01537265, + "auxiliary_loss_mlp": 0.01355527, + "balance_loss_clip": 1.17974091, + "balance_loss_mlp": 1.11138642, + "epoch": 0.15072899443859913, + "flos": 19861051605120.0, + "grad_norm": 3.1051505030668043, + "language_loss": 0.75881922, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78774714, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 2.710909128189087 + }, + { + "auxiliary_loss_clip": 0.01539151, + "auxiliary_loss_mlp": 0.01403112, + "balance_loss_clip": 1.1832701, + "balance_loss_mlp": 1.16641021, + "epoch": 0.1507891176912671, + "flos": 20778277406400.0, + "grad_norm": 2.0649103929295864, + "language_loss": 0.69221783, + "learning_rate": 3.848893461794131e-06, + "loss": 0.72164047, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 2.8624651432037354 + }, + { + "auxiliary_loss_clip": 0.0154185, + "auxiliary_loss_mlp": 0.01424968, + "balance_loss_clip": 1.18515134, + "balance_loss_mlp": 1.18578684, + "epoch": 0.15084924094393506, + "flos": 23588895899520.0, + "grad_norm": 1.629166003290684, + "language_loss": 0.77448606, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.80415428, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 2.7768146991729736 + }, + { + "auxiliary_loss_clip": 0.01540355, + "auxiliary_loss_mlp": 0.01420574, + "balance_loss_clip": 1.18360519, + "balance_loss_mlp": 1.17776847, + "epoch": 0.15090936419660303, + "flos": 18913027773600.0, + "grad_norm": 8.52975103546817, + "language_loss": 0.80111897, + "learning_rate": 3.848596309368246e-06, + "loss": 0.83072829, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 2.740567684173584 + }, + { + "auxiliary_loss_clip": 0.01536137, + "auxiliary_loss_mlp": 0.01414759, + "balance_loss_clip": 1.17896295, + "balance_loss_mlp": 1.1755774, + "epoch": 0.150969487449271, + "flos": 17929995886080.0, + "grad_norm": 2.170212120432146, + "language_loss": 0.74049401, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.77000296, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 2.7759575843811035 + }, + { + "auxiliary_loss_clip": 0.01534614, + "auxiliary_loss_mlp": 0.01398921, + "balance_loss_clip": 1.17766476, + "balance_loss_mlp": 1.15668738, + "epoch": 0.151029610701939, + "flos": 24245363050560.0, + "grad_norm": 2.4744959218406555, + "language_loss": 0.69257492, + "learning_rate": 3.848298876546534e-06, + "loss": 0.72191024, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.741692543029785 + }, + { + "auxiliary_loss_clip": 0.01537821, + "auxiliary_loss_mlp": 0.01384495, + "balance_loss_clip": 1.18024981, + "balance_loss_mlp": 1.14359713, + "epoch": 0.15108973395460695, + "flos": 30265077293280.0, + "grad_norm": 7.536633906050522, + "language_loss": 0.73771465, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76693785, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 2.8663289546966553 + }, + { + "auxiliary_loss_clip": 0.01665508, + "auxiliary_loss_mlp": 0.01419693, + "balance_loss_clip": 1.31549549, + "balance_loss_mlp": 1.23544312, + "epoch": 0.15114985720727492, + "flos": 60444018642240.0, + "grad_norm": 1.1459234128450033, + "language_loss": 0.64674067, + "learning_rate": 3.84800116337411e-06, + "loss": 0.67759269, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 3.2685017585754395 + }, + { + "auxiliary_loss_clip": 0.01537529, + "auxiliary_loss_mlp": 0.01318147, + "balance_loss_clip": 1.17940855, + "balance_loss_mlp": 1.07400632, + "epoch": 0.15120998045994288, + "flos": 20523776902560.0, + "grad_norm": 3.0678481733208955, + "language_loss": 0.73253942, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.76109624, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 2.763908624649048 + }, + { + "auxiliary_loss_clip": 0.01531108, + "auxiliary_loss_mlp": 0.01335598, + "balance_loss_clip": 1.17344999, + "balance_loss_mlp": 1.0775336, + "epoch": 0.15127010371261085, + "flos": 21181040544960.0, + "grad_norm": 2.144001544313278, + "language_loss": 0.77959132, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.80825841, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 2.8528974056243896 + }, + { + "auxiliary_loss_clip": 0.01634964, + "auxiliary_loss_mlp": 0.01360489, + "balance_loss_clip": 1.283023, + "balance_loss_mlp": 1.15144348, + "epoch": 0.1513302269652788, + "flos": 65326976707680.0, + "grad_norm": 0.7974642954308556, + "language_loss": 0.54671776, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.57667232, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.2593538761138916 + }, + { + "auxiliary_loss_clip": 0.01526958, + "auxiliary_loss_mlp": 0.01319487, + "balance_loss_clip": 1.16857421, + "balance_loss_mlp": 1.06104088, + "epoch": 0.15139035021794678, + "flos": 19137868092000.0, + "grad_norm": 1.8820398591153924, + "language_loss": 0.78579545, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.81425989, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 2.8123762607574463 + }, + { + "auxiliary_loss_clip": 0.01536042, + "auxiliary_loss_mlp": 0.01343037, + "balance_loss_clip": 1.17945075, + "balance_loss_mlp": 1.08897817, + "epoch": 0.15145047347061477, + "flos": 26581002459840.0, + "grad_norm": 3.9649817206263114, + "language_loss": 0.70706928, + "learning_rate": 3.847255654205137e-06, + "loss": 0.73586005, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 2.7942447662353516 + }, + { + "auxiliary_loss_clip": 0.01532876, + "auxiliary_loss_mlp": 0.01396867, + "balance_loss_clip": 1.1753366, + "balance_loss_mlp": 1.14566922, + "epoch": 0.15151059672328274, + "flos": 20305043017920.0, + "grad_norm": 2.051611223569874, + "language_loss": 0.78825355, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81755096, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 2.8855059146881104 + }, + { + "auxiliary_loss_clip": 0.01536325, + "auxiliary_loss_mlp": 0.01436309, + "balance_loss_clip": 1.17631161, + "balance_loss_mlp": 1.1904521, + "epoch": 0.1515707199759507, + "flos": 27230452901280.0, + "grad_norm": 2.759125948945102, + "language_loss": 0.75192964, + "learning_rate": 3.846956960161114e-06, + "loss": 0.78165603, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 2.8019049167633057 + }, + { + "auxiliary_loss_clip": 0.01522234, + "auxiliary_loss_mlp": 0.01436028, + "balance_loss_clip": 1.16415548, + "balance_loss_mlp": 1.19379485, + "epoch": 0.15163084322861867, + "flos": 23589654462720.0, + "grad_norm": 4.65183509217509, + "language_loss": 0.82251036, + "learning_rate": 3.84680750808108e-06, + "loss": 0.85209298, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.7762935161590576 + }, + { + "auxiliary_loss_clip": 0.01649677, + "auxiliary_loss_mlp": 0.01390854, + "balance_loss_clip": 1.29544032, + "balance_loss_mlp": 1.20431519, + "epoch": 0.15169096648128663, + "flos": 66896042424480.0, + "grad_norm": 0.8179204588312877, + "language_loss": 0.57948405, + "learning_rate": 3.846657985969922e-06, + "loss": 0.60988933, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.3020873069763184 + }, + { + "auxiliary_loss_clip": 0.01527435, + "auxiliary_loss_mlp": 0.01441803, + "balance_loss_clip": 1.17055774, + "balance_loss_mlp": 1.20376587, + "epoch": 0.1517510897339546, + "flos": 29097978223680.0, + "grad_norm": 2.342523701344782, + "language_loss": 0.75398397, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.78367639, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.8259217739105225 + }, + { + "auxiliary_loss_clip": 0.01521349, + "auxiliary_loss_mlp": 0.01422632, + "balance_loss_clip": 1.16333032, + "balance_loss_mlp": 1.18020785, + "epoch": 0.1518112129866226, + "flos": 18408578145120.0, + "grad_norm": 1.7339425555110886, + "language_loss": 0.74704921, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.77648902, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 4.395634889602661 + }, + { + "auxiliary_loss_clip": 0.01522839, + "auxiliary_loss_mlp": 0.01406412, + "balance_loss_clip": 1.16454148, + "balance_loss_mlp": 1.16932845, + "epoch": 0.15187133623929056, + "flos": 19427338723680.0, + "grad_norm": 1.9054617102996918, + "language_loss": 0.79890698, + "learning_rate": 3.846208999506402e-06, + "loss": 0.82819945, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.76041579246521 + }, + { + "auxiliary_loss_clip": 0.01530656, + "auxiliary_loss_mlp": 0.01367745, + "balance_loss_clip": 1.17206001, + "balance_loss_mlp": 1.12913561, + "epoch": 0.15193145949195852, + "flos": 17568119668320.0, + "grad_norm": 1.7884056032419504, + "language_loss": 0.84584296, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87482691, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.831071615219116 + }, + { + "auxiliary_loss_clip": 0.01528381, + "auxiliary_loss_mlp": 0.01317717, + "balance_loss_clip": 1.1691035, + "balance_loss_mlp": 1.07185984, + "epoch": 0.15199158274462649, + "flos": 36179350158240.0, + "grad_norm": 1.7562667616192236, + "language_loss": 0.69297409, + "learning_rate": 3.845909325145779e-06, + "loss": 0.72143507, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 2.9248297214508057 + }, + { + "auxiliary_loss_clip": 0.01537457, + "auxiliary_loss_mlp": 0.01371493, + "balance_loss_clip": 1.17842889, + "balance_loss_mlp": 1.12983179, + "epoch": 0.15205170599729445, + "flos": 23076177932160.0, + "grad_norm": 1.9705077461370755, + "language_loss": 0.87096518, + "learning_rate": 3.845759382967026e-06, + "loss": 0.90005463, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.800170660018921 + }, + { + "auxiliary_loss_clip": 0.01526714, + "auxiliary_loss_mlp": 0.01414792, + "balance_loss_clip": 1.16751647, + "balance_loss_mlp": 1.16397536, + "epoch": 0.15211182924996242, + "flos": 21910558060800.0, + "grad_norm": 2.1446425578012867, + "language_loss": 0.83477187, + "learning_rate": 3.845609370796893e-06, + "loss": 0.864187, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.862319231033325 + }, + { + "auxiliary_loss_clip": 0.01520411, + "auxiliary_loss_mlp": 0.01414636, + "balance_loss_clip": 1.16133285, + "balance_loss_mlp": 1.16172123, + "epoch": 0.15217195250263038, + "flos": 13883248343520.0, + "grad_norm": 3.5321804235940464, + "language_loss": 0.80633163, + "learning_rate": 3.845459288641066e-06, + "loss": 0.83568209, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.79179310798645 + }, + { + "auxiliary_loss_clip": 0.01526948, + "auxiliary_loss_mlp": 0.01426961, + "balance_loss_clip": 1.16797411, + "balance_loss_mlp": 1.17747962, + "epoch": 0.15223207575529837, + "flos": 24537640366080.0, + "grad_norm": 1.973223385278769, + "language_loss": 0.79038119, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81992024, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 4.252911329269409 + }, + { + "auxiliary_loss_clip": 0.01523738, + "auxiliary_loss_mlp": 0.01416838, + "balance_loss_clip": 1.16389418, + "balance_loss_mlp": 1.16831076, + "epoch": 0.15229219900796634, + "flos": 25559169700320.0, + "grad_norm": 1.8657653897900017, + "language_loss": 0.87801468, + "learning_rate": 3.845158914395105e-06, + "loss": 0.9074204, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 4.135894775390625 + }, + { + "auxiliary_loss_clip": 0.01527362, + "auxiliary_loss_mlp": 0.01380185, + "balance_loss_clip": 1.16708112, + "balance_loss_mlp": 1.13299227, + "epoch": 0.1523523222606343, + "flos": 18219466517760.0, + "grad_norm": 2.6713007902275074, + "language_loss": 0.78623819, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81531364, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 4.655238389968872 + }, + { + "auxiliary_loss_clip": 0.01524439, + "auxiliary_loss_mlp": 0.01335958, + "balance_loss_clip": 1.16375637, + "balance_loss_mlp": 1.08647728, + "epoch": 0.15241244551330227, + "flos": 13261713392160.0, + "grad_norm": 2.7295879583999993, + "language_loss": 0.76751018, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79611409, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.8246400356292725 + }, + { + "auxiliary_loss_clip": 0.01519842, + "auxiliary_loss_mlp": 0.01339493, + "balance_loss_clip": 1.15992689, + "balance_loss_mlp": 1.09554291, + "epoch": 0.15247256876597023, + "flos": 19717416205920.0, + "grad_norm": 2.3222268442248146, + "language_loss": 0.78860974, + "learning_rate": 3.844707828275835e-06, + "loss": 0.8172031, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.87551212310791 + }, + { + "auxiliary_loss_clip": 0.01523813, + "auxiliary_loss_mlp": 0.0137195, + "balance_loss_clip": 1.165097, + "balance_loss_mlp": 1.13200545, + "epoch": 0.1525326920186382, + "flos": 20377941670080.0, + "grad_norm": 2.6662068167655972, + "language_loss": 0.76149213, + "learning_rate": 3.844557326325461e-06, + "loss": 0.79044974, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.7623772621154785 + }, + { + "auxiliary_loss_clip": 0.01525199, + "auxiliary_loss_mlp": 0.01414909, + "balance_loss_clip": 1.16550803, + "balance_loss_mlp": 1.17744446, + "epoch": 0.15259281527130616, + "flos": 13591691663040.0, + "grad_norm": 3.3033492068380976, + "language_loss": 0.77782238, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.8072235, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 2.785644769668579 + }, + { + "auxiliary_loss_clip": 0.01517654, + "auxiliary_loss_mlp": 0.01401731, + "balance_loss_clip": 1.15806091, + "balance_loss_mlp": 1.16655445, + "epoch": 0.15265293852397416, + "flos": 22862791918080.0, + "grad_norm": 2.478403765059674, + "language_loss": 0.89830971, + "learning_rate": 3.844256112593029e-06, + "loss": 0.92750353, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.8478658199310303 + }, + { + "auxiliary_loss_clip": 0.01524141, + "auxiliary_loss_mlp": 0.01400424, + "balance_loss_clip": 1.16348815, + "balance_loss_mlp": 1.15857267, + "epoch": 0.15271306177664212, + "flos": 29240475778080.0, + "grad_norm": 2.4784351782075404, + "language_loss": 0.93908733, + "learning_rate": 3.844105400822391e-06, + "loss": 0.96833301, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.8374431133270264 + }, + { + "auxiliary_loss_clip": 0.01522436, + "auxiliary_loss_mlp": 0.01395394, + "balance_loss_clip": 1.16242361, + "balance_loss_mlp": 1.16231573, + "epoch": 0.1527731850293101, + "flos": 31248829815840.0, + "grad_norm": 1.9086364611706064, + "language_loss": 0.75581479, + "learning_rate": 3.843954619123092e-06, + "loss": 0.78499305, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.833624839782715 + }, + { + "auxiliary_loss_clip": 0.01524207, + "auxiliary_loss_mlp": 0.01359926, + "balance_loss_clip": 1.16475987, + "balance_loss_mlp": 1.11902785, + "epoch": 0.15283330828197805, + "flos": 22384247587200.0, + "grad_norm": 2.1380512500772864, + "language_loss": 0.8156141, + "learning_rate": 3.84380376750085e-06, + "loss": 0.84445548, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.8097620010375977 + }, + { + "auxiliary_loss_clip": 0.01536633, + "auxiliary_loss_mlp": 0.01327931, + "balance_loss_clip": 1.17667425, + "balance_loss_mlp": 1.09008455, + "epoch": 0.15289343153464602, + "flos": 25522492805280.0, + "grad_norm": 2.6315773267752722, + "language_loss": 0.78979856, + "learning_rate": 3.843652845961383e-06, + "loss": 0.81844413, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 2.8979737758636475 + }, + { + "auxiliary_loss_clip": 0.01531919, + "auxiliary_loss_mlp": 0.01319427, + "balance_loss_clip": 1.17243683, + "balance_loss_mlp": 1.0754776, + "epoch": 0.15295355478731398, + "flos": 22712177737440.0, + "grad_norm": 2.0899573186078553, + "language_loss": 0.86722285, + "learning_rate": 3.843501854510416e-06, + "loss": 0.89573628, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 2.8275489807128906 + }, + { + "auxiliary_loss_clip": 0.01518957, + "auxiliary_loss_mlp": 0.01357057, + "balance_loss_clip": 1.15938807, + "balance_loss_mlp": 1.10891116, + "epoch": 0.15301367803998198, + "flos": 23253797327040.0, + "grad_norm": 2.310270947616152, + "language_loss": 0.82987535, + "learning_rate": 3.843350793153673e-06, + "loss": 0.85863549, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 2.8908753395080566 + }, + { + "auxiliary_loss_clip": 0.01537727, + "auxiliary_loss_mlp": 0.01340607, + "balance_loss_clip": 1.17916644, + "balance_loss_mlp": 1.087502, + "epoch": 0.15307380129264994, + "flos": 25888730761440.0, + "grad_norm": 2.2358798519165024, + "language_loss": 0.71422982, + "learning_rate": 3.843199661896884e-06, + "loss": 0.74301314, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 2.863569974899292 + }, + { + "auxiliary_loss_clip": 0.01536972, + "auxiliary_loss_mlp": 0.01335911, + "balance_loss_clip": 1.17699015, + "balance_loss_mlp": 1.08204293, + "epoch": 0.1531339245453179, + "flos": 46976770729440.0, + "grad_norm": 1.7176855217172087, + "language_loss": 0.77522516, + "learning_rate": 3.843048460745779e-06, + "loss": 0.80395401, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 2.997185707092285 + }, + { + "auxiliary_loss_clip": 0.01546843, + "auxiliary_loss_mlp": 0.0132153, + "balance_loss_clip": 1.18592334, + "balance_loss_mlp": 1.07452798, + "epoch": 0.15319404779798587, + "flos": 35884986793920.0, + "grad_norm": 2.065722883911737, + "language_loss": 0.74379599, + "learning_rate": 3.842897189706092e-06, + "loss": 0.77247971, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 2.914375066757202 + }, + { + "auxiliary_loss_clip": 0.01541761, + "auxiliary_loss_mlp": 0.01329187, + "balance_loss_clip": 1.18023586, + "balance_loss_mlp": 1.07856119, + "epoch": 0.15325417105065384, + "flos": 25666924695840.0, + "grad_norm": 1.595765018103348, + "language_loss": 0.80692464, + "learning_rate": 3.842745848783558e-06, + "loss": 0.83563411, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 2.823429584503174 + }, + { + "auxiliary_loss_clip": 0.01537885, + "auxiliary_loss_mlp": 0.01363881, + "balance_loss_clip": 1.1763432, + "balance_loss_mlp": 1.11955011, + "epoch": 0.1533142943033218, + "flos": 18772843836960.0, + "grad_norm": 3.16822801363438, + "language_loss": 0.75170392, + "learning_rate": 3.842594437983917e-06, + "loss": 0.78072155, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 2.804891347885132 + }, + { + "auxiliary_loss_clip": 0.01541514, + "auxiliary_loss_mlp": 0.01372099, + "balance_loss_clip": 1.17941046, + "balance_loss_mlp": 1.12566948, + "epoch": 0.15337441755598977, + "flos": 23109327508320.0, + "grad_norm": 2.4275842248701354, + "language_loss": 0.77309775, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.80223393, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 2.810112237930298 + }, + { + "auxiliary_loss_clip": 0.01706564, + "auxiliary_loss_mlp": 0.01281517, + "balance_loss_clip": 1.3465631, + "balance_loss_mlp": 1.08391571, + "epoch": 0.15343454080865776, + "flos": 59867846134560.0, + "grad_norm": 0.9312217329070797, + "language_loss": 0.56648517, + "learning_rate": 3.842291406776283e-06, + "loss": 0.59636593, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.3283746242523193 + }, + { + "auxiliary_loss_clip": 0.01546547, + "auxiliary_loss_mlp": 0.01314209, + "balance_loss_clip": 1.18505883, + "balance_loss_mlp": 1.06263018, + "epoch": 0.15349466406132573, + "flos": 11912481476640.0, + "grad_norm": 3.1650656710820315, + "language_loss": 0.89170957, + "learning_rate": 3.84213978637978e-06, + "loss": 0.92031705, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 2.757784366607666 + }, + { + "auxiliary_loss_clip": 0.01535092, + "auxiliary_loss_mlp": 0.01345816, + "balance_loss_clip": 1.17561495, + "balance_loss_mlp": 1.08622634, + "epoch": 0.1535547873139937, + "flos": 24099186464640.0, + "grad_norm": 1.6159070332564254, + "language_loss": 0.78360409, + "learning_rate": 3.841988096129152e-06, + "loss": 0.81241316, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 2.842094898223877 + }, + { + "auxiliary_loss_clip": 0.01537418, + "auxiliary_loss_mlp": 0.01382292, + "balance_loss_clip": 1.177001, + "balance_loss_mlp": 1.1251812, + "epoch": 0.15361491056666166, + "flos": 17568385165440.0, + "grad_norm": 2.3882313113190534, + "language_loss": 0.78039622, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80959332, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 2.7770450115203857 + }, + { + "auxiliary_loss_clip": 0.01543942, + "auxiliary_loss_mlp": 0.01384624, + "balance_loss_clip": 1.1825105, + "balance_loss_mlp": 1.13418925, + "epoch": 0.15367503381932962, + "flos": 25048386069120.0, + "grad_norm": 1.8684595048739927, + "language_loss": 0.7704103, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79969597, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 2.861719846725464 + }, + { + "auxiliary_loss_clip": 0.01544389, + "auxiliary_loss_mlp": 0.01365515, + "balance_loss_clip": 1.18323565, + "balance_loss_mlp": 1.11317253, + "epoch": 0.15373515707199759, + "flos": 21509501689440.0, + "grad_norm": 2.749869462494156, + "language_loss": 0.89598906, + "learning_rate": 3.84153260631005e-06, + "loss": 0.92508811, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.789285659790039 + }, + { + "auxiliary_loss_clip": 0.01536224, + "auxiliary_loss_mlp": 0.0133956, + "balance_loss_clip": 1.17758393, + "balance_loss_mlp": 1.08950686, + "epoch": 0.15379528032466555, + "flos": 25997130535680.0, + "grad_norm": 2.3720272635023676, + "language_loss": 0.70680767, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73556548, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.853285789489746 + }, + { + "auxiliary_loss_clip": 0.01548626, + "auxiliary_loss_mlp": 0.01324535, + "balance_loss_clip": 1.18902564, + "balance_loss_mlp": 1.0872612, + "epoch": 0.15385540357733354, + "flos": 19279114017120.0, + "grad_norm": 2.029795830948196, + "language_loss": 0.92435527, + "learning_rate": 3.841228597265548e-06, + "loss": 0.95308697, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 2.8718719482421875 + }, + { + "auxiliary_loss_clip": 0.01548217, + "auxiliary_loss_mlp": 0.01351724, + "balance_loss_clip": 1.18937635, + "balance_loss_mlp": 1.11292434, + "epoch": 0.1539155268300015, + "flos": 28551693470400.0, + "grad_norm": 2.2407829893492055, + "language_loss": 0.63737893, + "learning_rate": 3.841076488011055e-06, + "loss": 0.66637826, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 2.8219869136810303 + }, + { + "auxiliary_loss_clip": 0.01544585, + "auxiliary_loss_mlp": 0.01354409, + "balance_loss_clip": 1.18466651, + "balance_loss_mlp": 1.1205678, + "epoch": 0.15397565008266947, + "flos": 23550019171200.0, + "grad_norm": 2.2722928089450445, + "language_loss": 0.88343084, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.91242081, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 2.7948055267333984 + }, + { + "auxiliary_loss_clip": 0.01547634, + "auxiliary_loss_mlp": 0.0133524, + "balance_loss_clip": 1.18900621, + "balance_loss_mlp": 1.09453285, + "epoch": 0.15403577333533744, + "flos": 17131789743840.0, + "grad_norm": 1.8663695565645728, + "language_loss": 0.83455729, + "learning_rate": 3.840772060066425e-06, + "loss": 0.86338603, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.7697439193725586 + }, + { + "auxiliary_loss_clip": 0.01552373, + "auxiliary_loss_mlp": 0.01349367, + "balance_loss_clip": 1.19310772, + "balance_loss_mlp": 1.10179365, + "epoch": 0.1540958965880054, + "flos": 17896429100160.0, + "grad_norm": 2.0187481710815147, + "language_loss": 0.75091004, + "learning_rate": 3.840619741387832e-06, + "loss": 0.77992737, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 4.34395694732666 + }, + { + "auxiliary_loss_clip": 0.01553603, + "auxiliary_loss_mlp": 0.01348532, + "balance_loss_clip": 1.19495082, + "balance_loss_mlp": 1.10420084, + "epoch": 0.15415601984067337, + "flos": 32163817855680.0, + "grad_norm": 2.6984023456099373, + "language_loss": 0.76205754, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.79107887, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.97851300239563 + }, + { + "auxiliary_loss_clip": 0.01541612, + "auxiliary_loss_mlp": 0.01326563, + "balance_loss_clip": 1.1846714, + "balance_loss_mlp": 1.08623743, + "epoch": 0.15421614309334136, + "flos": 24026705022240.0, + "grad_norm": 2.097314690411368, + "language_loss": 0.71258855, + "learning_rate": 3.840314894646969e-06, + "loss": 0.7412703, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.83402681350708 + }, + { + "auxiliary_loss_clip": 0.01535572, + "auxiliary_loss_mlp": 0.01308499, + "balance_loss_clip": 1.17730331, + "balance_loss_mlp": 1.05825448, + "epoch": 0.15427626634600933, + "flos": 24388429527360.0, + "grad_norm": 2.0575418433020833, + "language_loss": 0.71730232, + "learning_rate": 3.840162366596259e-06, + "loss": 0.74574304, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.834425926208496 + }, + { + "auxiliary_loss_clip": 0.01538216, + "auxiliary_loss_mlp": 0.01311699, + "balance_loss_clip": 1.18197465, + "balance_loss_mlp": 1.06641388, + "epoch": 0.1543363895986773, + "flos": 23333788545120.0, + "grad_norm": 2.0029037142118455, + "language_loss": 0.851161, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87966019, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 2.788767099380493 + }, + { + "auxiliary_loss_clip": 0.01554146, + "auxiliary_loss_mlp": 0.01303187, + "balance_loss_clip": 1.19574094, + "balance_loss_mlp": 1.06114483, + "epoch": 0.15439651285134526, + "flos": 24276274865280.0, + "grad_norm": 2.1905476813469353, + "language_loss": 0.78979784, + "learning_rate": 3.839857101163202e-06, + "loss": 0.81837118, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.863818407058716 + }, + { + "auxiliary_loss_clip": 0.01547696, + "auxiliary_loss_mlp": 0.01322373, + "balance_loss_clip": 1.18990731, + "balance_loss_mlp": 1.07575274, + "epoch": 0.15445663610401322, + "flos": 22458397868640.0, + "grad_norm": 2.340409828551267, + "language_loss": 0.70291531, + "learning_rate": 3.83970436379243e-06, + "loss": 0.73161602, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.8910539150238037 + }, + { + "auxiliary_loss_clip": 0.01542458, + "auxiliary_loss_mlp": 0.01344591, + "balance_loss_clip": 1.18590188, + "balance_loss_mlp": 1.10273957, + "epoch": 0.1545167593566812, + "flos": 22051424704320.0, + "grad_norm": 2.0065842273956536, + "language_loss": 0.77090704, + "learning_rate": 3.839551556659884e-06, + "loss": 0.79977763, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 4.263026475906372 + }, + { + "auxiliary_loss_clip": 0.01543251, + "auxiliary_loss_mlp": 0.01312147, + "balance_loss_clip": 1.18571901, + "balance_loss_mlp": 1.06667161, + "epoch": 0.15457688260934915, + "flos": 19320304363200.0, + "grad_norm": 2.8588002118505456, + "language_loss": 0.77959263, + "learning_rate": 3.839398679771359e-06, + "loss": 0.8081466, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 4.245629787445068 + }, + { + "auxiliary_loss_clip": 0.01541134, + "auxiliary_loss_mlp": 0.01309393, + "balance_loss_clip": 1.18492639, + "balance_loss_mlp": 1.06487072, + "epoch": 0.15463700586201715, + "flos": 24136280569440.0, + "grad_norm": 2.7160956055678445, + "language_loss": 0.82470608, + "learning_rate": 3.839245733132652e-06, + "loss": 0.8532114, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 4.3984010219573975 + }, + { + "auxiliary_loss_clip": 0.01548386, + "auxiliary_loss_mlp": 0.01342477, + "balance_loss_clip": 1.18986058, + "balance_loss_mlp": 1.10062516, + "epoch": 0.1546971291146851, + "flos": 22423238100000.0, + "grad_norm": 1.8788759873447618, + "language_loss": 0.90733033, + "learning_rate": 3.839092716749563e-06, + "loss": 0.93623894, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.836569309234619 + }, + { + "auxiliary_loss_clip": 0.01545577, + "auxiliary_loss_mlp": 0.0132025, + "balance_loss_clip": 1.18903708, + "balance_loss_mlp": 1.07763481, + "epoch": 0.15475725236735308, + "flos": 17532163408320.0, + "grad_norm": 1.9831305816379547, + "language_loss": 0.699826, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72848427, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.772761106491089 + }, + { + "auxiliary_loss_clip": 0.0154807, + "auxiliary_loss_mlp": 0.01302733, + "balance_loss_clip": 1.19043636, + "balance_loss_mlp": 1.0524888, + "epoch": 0.15481737562002104, + "flos": 22563763390080.0, + "grad_norm": 4.602361216826536, + "language_loss": 0.8241989, + "learning_rate": 3.838786474773448e-06, + "loss": 0.85270691, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.7927043437957764 + }, + { + "auxiliary_loss_clip": 0.01538296, + "auxiliary_loss_mlp": 0.01320188, + "balance_loss_clip": 1.1796813, + "balance_loss_mlp": 1.07356787, + "epoch": 0.154877498872689, + "flos": 24902930118240.0, + "grad_norm": 2.2018567480549276, + "language_loss": 0.852319, + "learning_rate": 3.838633249192036e-06, + "loss": 0.88090384, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.8459064960479736 + }, + { + "auxiliary_loss_clip": 0.01538558, + "auxiliary_loss_mlp": 0.01299299, + "balance_loss_clip": 1.17973971, + "balance_loss_mlp": 1.05553961, + "epoch": 0.15493762212535697, + "flos": 28150030248480.0, + "grad_norm": 2.6096312945951654, + "language_loss": 0.81947821, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84785676, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.836794376373291 + }, + { + "auxiliary_loss_clip": 0.01556262, + "auxiliary_loss_mlp": 0.01341555, + "balance_loss_clip": 1.19656754, + "balance_loss_mlp": 1.09951186, + "epoch": 0.15499774537802496, + "flos": 25413448252320.0, + "grad_norm": 2.5676018756534957, + "language_loss": 0.76238513, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.79136324, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.8595635890960693 + }, + { + "auxiliary_loss_clip": 0.01547242, + "auxiliary_loss_mlp": 0.01335218, + "balance_loss_clip": 1.18827367, + "balance_loss_mlp": 1.09260333, + "epoch": 0.15505786863069293, + "flos": 22093790823360.0, + "grad_norm": 2.09743800686278, + "language_loss": 0.82930648, + "learning_rate": 3.83817315414411e-06, + "loss": 0.85813111, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.831681251525879 + }, + { + "auxiliary_loss_clip": 0.0155658, + "auxiliary_loss_mlp": 0.01306003, + "balance_loss_clip": 1.19835699, + "balance_loss_mlp": 1.06243491, + "epoch": 0.1551179918833609, + "flos": 18919172135520.0, + "grad_norm": 2.029184019862861, + "language_loss": 0.80568135, + "learning_rate": 3.838019649712958e-06, + "loss": 0.83430719, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.8837890625 + }, + { + "auxiliary_loss_clip": 0.0169671, + "auxiliary_loss_mlp": 0.0125354, + "balance_loss_clip": 1.33728099, + "balance_loss_mlp": 1.05899048, + "epoch": 0.15517811513602886, + "flos": 66245909276160.0, + "grad_norm": 0.8370211787452398, + "language_loss": 0.58818239, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.6176849, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 3.460433006286621 + }, + { + "auxiliary_loss_clip": 0.01542254, + "auxiliary_loss_mlp": 0.01320709, + "balance_loss_clip": 1.18355918, + "balance_loss_mlp": 1.07695031, + "epoch": 0.15523823838869683, + "flos": 24023177703360.0, + "grad_norm": 1.9506942928169162, + "language_loss": 0.85495442, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.88358402, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 2.804095506668091 + }, + { + "auxiliary_loss_clip": 0.01556835, + "auxiliary_loss_mlp": 0.0135062, + "balance_loss_clip": 1.19594538, + "balance_loss_mlp": 1.11201072, + "epoch": 0.1552983616413648, + "flos": 20487137935680.0, + "grad_norm": 2.235920588665112, + "language_loss": 0.79071081, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81978542, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.839815139770508 + }, + { + "auxiliary_loss_clip": 0.01547037, + "auxiliary_loss_mlp": 0.01333528, + "balance_loss_clip": 1.18710303, + "balance_loss_mlp": 1.09129429, + "epoch": 0.15535848489403276, + "flos": 32126420325600.0, + "grad_norm": 1.6706633708154452, + "language_loss": 0.76429498, + "learning_rate": 3.837404935067705e-06, + "loss": 0.7931006, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 2.885507106781006 + }, + { + "auxiliary_loss_clip": 0.01543761, + "auxiliary_loss_mlp": 0.01296083, + "balance_loss_clip": 1.18340993, + "balance_loss_mlp": 1.05308723, + "epoch": 0.15541860814670075, + "flos": 19100432633760.0, + "grad_norm": 2.7044254336267985, + "language_loss": 0.75877082, + "learning_rate": 3.837251082205368e-06, + "loss": 0.78716928, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 2.819324493408203 + }, + { + "auxiliary_loss_clip": 0.01552945, + "auxiliary_loss_mlp": 0.01303754, + "balance_loss_clip": 1.19335675, + "balance_loss_mlp": 1.06533527, + "epoch": 0.1554787313993687, + "flos": 19174127777280.0, + "grad_norm": 2.1004697695005867, + "language_loss": 0.62068683, + "learning_rate": 3.837097159674286e-06, + "loss": 0.64925385, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 2.762848138809204 + }, + { + "auxiliary_loss_clip": 0.01547442, + "auxiliary_loss_mlp": 0.01296374, + "balance_loss_clip": 1.18762231, + "balance_loss_mlp": 1.05814576, + "epoch": 0.15553885465203668, + "flos": 16145685675360.0, + "grad_norm": 1.8834077529437374, + "language_loss": 0.81663895, + "learning_rate": 3.836943167480296e-06, + "loss": 0.84507716, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 2.8671836853027344 + }, + { + "auxiliary_loss_clip": 0.01550619, + "auxiliary_loss_mlp": 0.01321742, + "balance_loss_clip": 1.1902349, + "balance_loss_mlp": 1.08446813, + "epoch": 0.15559897790470464, + "flos": 25340132390400.0, + "grad_norm": 2.04296696366951, + "language_loss": 0.88525146, + "learning_rate": 3.836789105629236e-06, + "loss": 0.913975, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 2.842900037765503 + }, + { + "auxiliary_loss_clip": 0.01546613, + "auxiliary_loss_mlp": 0.01332395, + "balance_loss_clip": 1.18619061, + "balance_loss_mlp": 1.09893537, + "epoch": 0.1556591011573726, + "flos": 23151010920480.0, + "grad_norm": 2.882791905747083, + "language_loss": 0.64717329, + "learning_rate": 3.83663497412695e-06, + "loss": 0.67596334, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 2.898031234741211 + }, + { + "auxiliary_loss_clip": 0.01553687, + "auxiliary_loss_mlp": 0.01326196, + "balance_loss_clip": 1.19266653, + "balance_loss_mlp": 1.08930349, + "epoch": 0.15571922441004057, + "flos": 25373054397600.0, + "grad_norm": 1.787473035348291, + "language_loss": 0.83034801, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85914683, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 2.878298044204712 + }, + { + "auxiliary_loss_clip": 0.01541592, + "auxiliary_loss_mlp": 0.012993, + "balance_loss_clip": 1.18138957, + "balance_loss_mlp": 1.05954587, + "epoch": 0.15577934766270854, + "flos": 14503114455840.0, + "grad_norm": 2.2872946505665195, + "language_loss": 0.79658031, + "learning_rate": 3.836326502192077e-06, + "loss": 0.8249892, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 2.821812152862549 + }, + { + "auxiliary_loss_clip": 0.01547982, + "auxiliary_loss_mlp": 0.01303788, + "balance_loss_clip": 1.18650091, + "balance_loss_mlp": 1.06231785, + "epoch": 0.15583947091537653, + "flos": 37417565256480.0, + "grad_norm": 4.519166073227066, + "language_loss": 0.65189618, + "learning_rate": 3.836172161771189e-06, + "loss": 0.68041396, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 3.004404067993164 + }, + { + "auxiliary_loss_clip": 0.01551375, + "auxiliary_loss_mlp": 0.01308365, + "balance_loss_clip": 1.18984556, + "balance_loss_mlp": 1.0695653, + "epoch": 0.1558995941680445, + "flos": 21836862917280.0, + "grad_norm": 2.2899490941782905, + "language_loss": 0.82378763, + "learning_rate": 3.836017751722467e-06, + "loss": 0.85238504, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 2.783595323562622 + }, + { + "auxiliary_loss_clip": 0.01547153, + "auxiliary_loss_mlp": 0.01327521, + "balance_loss_clip": 1.18629646, + "balance_loss_mlp": 1.08986545, + "epoch": 0.15595971742071246, + "flos": 19794714524640.0, + "grad_norm": 2.267703096484556, + "language_loss": 0.73711574, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.76586246, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 2.7859103679656982 + }, + { + "auxiliary_loss_clip": 0.01538302, + "auxiliary_loss_mlp": 0.01314298, + "balance_loss_clip": 1.17823339, + "balance_loss_mlp": 1.08141124, + "epoch": 0.15601984067338043, + "flos": 26724448218240.0, + "grad_norm": 3.0490286502934127, + "language_loss": 0.8206706, + "learning_rate": 3.835708722764952e-06, + "loss": 0.84919655, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 2.838749647140503 + }, + { + "auxiliary_loss_clip": 0.01550925, + "auxiliary_loss_mlp": 0.01305368, + "balance_loss_clip": 1.19043946, + "balance_loss_mlp": 1.06599593, + "epoch": 0.1560799639260484, + "flos": 18371256471360.0, + "grad_norm": 2.2527818285588603, + "language_loss": 0.86876243, + "learning_rate": 3.835554103867876e-06, + "loss": 0.89732534, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.7824478149414062 + }, + { + "auxiliary_loss_clip": 0.01545073, + "auxiliary_loss_mlp": 0.01308619, + "balance_loss_clip": 1.18594384, + "balance_loss_mlp": 1.06695807, + "epoch": 0.15614008717871636, + "flos": 22601047135680.0, + "grad_norm": 1.7445899019424107, + "language_loss": 0.68765312, + "learning_rate": 3.835399415366404e-06, + "loss": 0.71619004, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.877453327178955 + }, + { + "auxiliary_loss_clip": 0.01543163, + "auxiliary_loss_mlp": 0.01296347, + "balance_loss_clip": 1.18303812, + "balance_loss_mlp": 1.05602145, + "epoch": 0.15620021043138435, + "flos": 22749044273280.0, + "grad_norm": 2.029296272261754, + "language_loss": 0.79979205, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82818717, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.7856881618499756 + }, + { + "auxiliary_loss_clip": 0.01551167, + "auxiliary_loss_mlp": 0.01296524, + "balance_loss_clip": 1.19018149, + "balance_loss_mlp": 1.05810523, + "epoch": 0.15626033368405232, + "flos": 13116598794720.0, + "grad_norm": 2.577195978746469, + "language_loss": 0.83016753, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85864449, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 2.76957631111145 + }, + { + "auxiliary_loss_clip": 0.01550478, + "auxiliary_loss_mlp": 0.01331025, + "balance_loss_clip": 1.18918777, + "balance_loss_mlp": 1.0905081, + "epoch": 0.15632045693672028, + "flos": 16474184748000.0, + "grad_norm": 2.0601643438345674, + "language_loss": 0.82003766, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84885275, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.8211989402770996 + }, + { + "auxiliary_loss_clip": 0.01538243, + "auxiliary_loss_mlp": 0.01287083, + "balance_loss_clip": 1.1770339, + "balance_loss_mlp": 1.04446793, + "epoch": 0.15638058018938825, + "flos": 20852465616000.0, + "grad_norm": 2.16471667297808, + "language_loss": 0.88321501, + "learning_rate": 3.834779965433917e-06, + "loss": 0.91146827, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 4.393733978271484 + }, + { + "auxiliary_loss_clip": 0.0155995, + "auxiliary_loss_mlp": 0.01368417, + "balance_loss_clip": 1.19801152, + "balance_loss_mlp": 1.12542117, + "epoch": 0.1564407034420562, + "flos": 21874412160000.0, + "grad_norm": 2.0350937062878693, + "language_loss": 0.78750885, + "learning_rate": 3.834624928998508e-06, + "loss": 0.81679249, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.936734437942505 + }, + { + "auxiliary_loss_clip": 0.01543616, + "auxiliary_loss_mlp": 0.01329889, + "balance_loss_clip": 1.18250322, + "balance_loss_mlp": 1.08689308, + "epoch": 0.15650082669472418, + "flos": 21836673276480.0, + "grad_norm": 3.435183895159227, + "language_loss": 0.73801023, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76674527, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.833082914352417 + }, + { + "auxiliary_loss_clip": 0.01543704, + "auxiliary_loss_mlp": 0.01332792, + "balance_loss_clip": 1.18161333, + "balance_loss_mlp": 1.0930388, + "epoch": 0.15656094994739214, + "flos": 13801815855360.0, + "grad_norm": 4.734649608889914, + "language_loss": 0.87588692, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.90465188, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.80253529548645 + }, + { + "auxiliary_loss_clip": 0.01539733, + "auxiliary_loss_mlp": 0.01282056, + "balance_loss_clip": 1.17945242, + "balance_loss_mlp": 1.04077685, + "epoch": 0.15662107320006013, + "flos": 27310671688320.0, + "grad_norm": 2.742927707743136, + "language_loss": 0.85456836, + "learning_rate": 3.834159402300841e-06, + "loss": 0.88278627, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.856276750564575 + }, + { + "auxiliary_loss_clip": 0.01542146, + "auxiliary_loss_mlp": 0.01347553, + "balance_loss_clip": 1.1821692, + "balance_loss_mlp": 1.11161423, + "epoch": 0.1566811964527281, + "flos": 26687354113440.0, + "grad_norm": 3.2199123099199114, + "language_loss": 0.73835552, + "learning_rate": 3.834004087624087e-06, + "loss": 0.76725256, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.9226036071777344 + }, + { + "auxiliary_loss_clip": 0.01553737, + "auxiliary_loss_mlp": 0.01366871, + "balance_loss_clip": 1.19288921, + "balance_loss_mlp": 1.1358912, + "epoch": 0.15674131970539606, + "flos": 16105064251680.0, + "grad_norm": 2.343825413516844, + "language_loss": 0.76234269, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.79154879, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 2.821320056915283 + }, + { + "auxiliary_loss_clip": 0.01548204, + "auxiliary_loss_mlp": 0.01365342, + "balance_loss_clip": 1.18713784, + "balance_loss_mlp": 1.13493419, + "epoch": 0.15680144295806403, + "flos": 19171586590560.0, + "grad_norm": 2.2996116283994943, + "language_loss": 0.82491136, + "learning_rate": 3.833693249639615e-06, + "loss": 0.85404682, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.8089940547943115 + }, + { + "auxiliary_loss_clip": 0.01540116, + "auxiliary_loss_mlp": 0.0132989, + "balance_loss_clip": 1.180233, + "balance_loss_mlp": 1.08956432, + "epoch": 0.156861566210732, + "flos": 20815523223840.0, + "grad_norm": 2.2244294827562316, + "language_loss": 0.72573835, + "learning_rate": 3.833537726343684e-06, + "loss": 0.7544384, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 4.296345472335815 + }, + { + "auxiliary_loss_clip": 0.0154183, + "auxiliary_loss_mlp": 0.01300496, + "balance_loss_clip": 1.18130934, + "balance_loss_mlp": 1.05654585, + "epoch": 0.15692168946339996, + "flos": 20050011519840.0, + "grad_norm": 3.3999605177660714, + "language_loss": 0.72156763, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74999094, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 4.3780739307403564 + }, + { + "auxiliary_loss_clip": 0.01546032, + "auxiliary_loss_mlp": 0.01326351, + "balance_loss_clip": 1.18724775, + "balance_loss_mlp": 1.08488023, + "epoch": 0.15698181271606793, + "flos": 21400191639360.0, + "grad_norm": 2.500392068307157, + "language_loss": 0.73265332, + "learning_rate": 3.833226471173919e-06, + "loss": 0.76137722, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.846090078353882 + }, + { + "auxiliary_loss_clip": 0.0155898, + "auxiliary_loss_mlp": 0.0132634, + "balance_loss_clip": 1.2011385, + "balance_loss_mlp": 1.08925605, + "epoch": 0.15704193596873592, + "flos": 20847648739680.0, + "grad_norm": 2.084052412215268, + "language_loss": 0.70834672, + "learning_rate": 3.833070739311887e-06, + "loss": 0.7371999, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.800483226776123 + }, + { + "auxiliary_loss_clip": 0.01553125, + "auxiliary_loss_mlp": 0.01306772, + "balance_loss_clip": 1.19554162, + "balance_loss_mlp": 1.06854439, + "epoch": 0.15710205922140388, + "flos": 21765102109920.0, + "grad_norm": 1.9013520457713982, + "language_loss": 0.76364958, + "learning_rate": 3.83291493793963e-06, + "loss": 0.79224861, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.7728071212768555 + }, + { + "auxiliary_loss_clip": 0.01541334, + "auxiliary_loss_mlp": 0.01309955, + "balance_loss_clip": 1.18288445, + "balance_loss_mlp": 1.07058299, + "epoch": 0.15716218247407185, + "flos": 25010002406880.0, + "grad_norm": 1.890903868624341, + "language_loss": 0.66332132, + "learning_rate": 3.832759067063055e-06, + "loss": 0.69183421, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.7953171730041504 + }, + { + "auxiliary_loss_clip": 0.01546916, + "auxiliary_loss_mlp": 0.01325399, + "balance_loss_clip": 1.18896341, + "balance_loss_mlp": 1.08202112, + "epoch": 0.1572223057267398, + "flos": 20193760703520.0, + "grad_norm": 3.2540932165266994, + "language_loss": 0.75410056, + "learning_rate": 3.832603126688072e-06, + "loss": 0.78282368, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.7478175163269043 + }, + { + "auxiliary_loss_clip": 0.01566437, + "auxiliary_loss_mlp": 0.0130219, + "balance_loss_clip": 1.20642507, + "balance_loss_mlp": 1.06415296, + "epoch": 0.15728242897940778, + "flos": 20961623953440.0, + "grad_norm": 1.6026620694011562, + "language_loss": 0.73445654, + "learning_rate": 3.832447116820594e-06, + "loss": 0.76314276, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.7519266605377197 + }, + { + "auxiliary_loss_clip": 0.01555616, + "auxiliary_loss_mlp": 0.01303506, + "balance_loss_clip": 1.19616008, + "balance_loss_mlp": 1.0618453, + "epoch": 0.15734255223207574, + "flos": 23040449241120.0, + "grad_norm": 1.8102918416553553, + "language_loss": 0.72794312, + "learning_rate": 3.832291037466539e-06, + "loss": 0.75653434, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.8884618282318115 + }, + { + "auxiliary_loss_clip": 0.0154737, + "auxiliary_loss_mlp": 0.01314692, + "balance_loss_clip": 1.18931103, + "balance_loss_mlp": 1.07417572, + "epoch": 0.15740267548474374, + "flos": 20552981950080.0, + "grad_norm": 3.7793287149274226, + "language_loss": 0.74574924, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.7743699, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.756078004837036 + }, + { + "auxiliary_loss_clip": 0.01550857, + "auxiliary_loss_mlp": 0.01335429, + "balance_loss_clip": 1.1906538, + "balance_loss_mlp": 1.0924325, + "epoch": 0.1574627987374117, + "flos": 22668711701760.0, + "grad_norm": 2.145286696708418, + "language_loss": 0.78901321, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81787604, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.854294776916504 + }, + { + "auxiliary_loss_clip": 0.01556306, + "auxiliary_loss_mlp": 0.01315682, + "balance_loss_clip": 1.19813561, + "balance_loss_mlp": 1.08126855, + "epoch": 0.15752292199007967, + "flos": 16802570036160.0, + "grad_norm": 1.7904802132375697, + "language_loss": 0.76906621, + "learning_rate": 3.831822382544101e-06, + "loss": 0.79778606, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 2.837700843811035 + }, + { + "auxiliary_loss_clip": 0.01548855, + "auxiliary_loss_mlp": 0.01308713, + "balance_loss_clip": 1.19016922, + "balance_loss_mlp": 1.06533527, + "epoch": 0.15758304524274763, + "flos": 29828633584320.0, + "grad_norm": 1.9450148330033759, + "language_loss": 0.71493602, + "learning_rate": 3.831666025302944e-06, + "loss": 0.74351168, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 2.9088797569274902 + }, + { + "auxiliary_loss_clip": 0.01553493, + "auxiliary_loss_mlp": 0.01307204, + "balance_loss_clip": 1.19417715, + "balance_loss_mlp": 1.06115592, + "epoch": 0.1576431684954156, + "flos": 53581343028480.0, + "grad_norm": 2.244659101963108, + "language_loss": 0.72668409, + "learning_rate": 3.831509598604828e-06, + "loss": 0.75529104, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 3.0979807376861572 + }, + { + "auxiliary_loss_clip": 0.0155034, + "auxiliary_loss_mlp": 0.01306268, + "balance_loss_clip": 1.19061041, + "balance_loss_mlp": 1.06804025, + "epoch": 0.15770329174808356, + "flos": 20815712864640.0, + "grad_norm": 2.0412659785561114, + "language_loss": 0.87919772, + "learning_rate": 3.831353102455684e-06, + "loss": 0.90776384, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 2.8279216289520264 + }, + { + "auxiliary_loss_clip": 0.01552192, + "auxiliary_loss_mlp": 0.01286205, + "balance_loss_clip": 1.19221663, + "balance_loss_mlp": 1.04282761, + "epoch": 0.15776341500075153, + "flos": 24976397692800.0, + "grad_norm": 2.313147582466554, + "language_loss": 0.81768751, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84607148, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 2.8610806465148926 + }, + { + "auxiliary_loss_clip": 0.01549602, + "auxiliary_loss_mlp": 0.01302537, + "balance_loss_clip": 1.19089985, + "balance_loss_mlp": 1.061257, + "epoch": 0.15782353825341952, + "flos": 21910064994720.0, + "grad_norm": 2.1089447544948587, + "language_loss": 0.80693173, + "learning_rate": 3.831039901828054e-06, + "loss": 0.83545315, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.813236713409424 + }, + { + "auxiliary_loss_clip": 0.01552455, + "auxiliary_loss_mlp": 0.01295418, + "balance_loss_clip": 1.1946404, + "balance_loss_mlp": 1.05280304, + "epoch": 0.15788366150608749, + "flos": 26179604735040.0, + "grad_norm": 3.125719528684489, + "language_loss": 0.80315429, + "learning_rate": 3.830883197361445e-06, + "loss": 0.83163303, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 2.839707374572754 + }, + { + "auxiliary_loss_clip": 0.01562198, + "auxiliary_loss_mlp": 0.01294292, + "balance_loss_clip": 1.20284104, + "balance_loss_mlp": 1.04976964, + "epoch": 0.15794378475875545, + "flos": 27712334910240.0, + "grad_norm": 1.7394101613099773, + "language_loss": 0.73816836, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76673329, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 2.812649965286255 + }, + { + "auxiliary_loss_clip": 0.015461, + "auxiliary_loss_mlp": 0.01321696, + "balance_loss_clip": 1.18730247, + "balance_loss_mlp": 1.07583857, + "epoch": 0.15800390801142342, + "flos": 12131784283680.0, + "grad_norm": 2.3827170989526465, + "language_loss": 0.85076916, + "learning_rate": 3.830569580152348e-06, + "loss": 0.8794471, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 2.7846522331237793 + }, + { + "auxiliary_loss_clip": 0.01546901, + "auxiliary_loss_mlp": 0.01298558, + "balance_loss_clip": 1.1877327, + "balance_loss_mlp": 1.05098367, + "epoch": 0.15806403126409138, + "flos": 20706857952480.0, + "grad_norm": 2.4736896604328455, + "language_loss": 0.77019167, + "learning_rate": 3.830412667421752e-06, + "loss": 0.79864621, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 2.818570375442505 + }, + { + "auxiliary_loss_clip": 0.01549623, + "auxiliary_loss_mlp": 0.01305085, + "balance_loss_clip": 1.19055057, + "balance_loss_mlp": 1.05884612, + "epoch": 0.15812415451675935, + "flos": 17823644232480.0, + "grad_norm": 2.9192738885256833, + "language_loss": 0.73956174, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.76810884, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 2.8131496906280518 + }, + { + "auxiliary_loss_clip": 0.01553389, + "auxiliary_loss_mlp": 0.01325838, + "balance_loss_clip": 1.19380069, + "balance_loss_mlp": 1.08303261, + "epoch": 0.15818427776942734, + "flos": 20086119492480.0, + "grad_norm": 2.937273594716846, + "language_loss": 0.84484547, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.8736378, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 2.880504608154297 + }, + { + "auxiliary_loss_clip": 0.01548211, + "auxiliary_loss_mlp": 0.01293583, + "balance_loss_clip": 1.1893189, + "balance_loss_mlp": 1.05535543, + "epoch": 0.1582444010220953, + "flos": 21217034733120.0, + "grad_norm": 1.8831107851830502, + "language_loss": 0.78899461, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.81741261, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.892183303833008 + }, + { + "auxiliary_loss_clip": 0.0155342, + "auxiliary_loss_mlp": 0.01309744, + "balance_loss_clip": 1.19570541, + "balance_loss_mlp": 1.0625515, + "epoch": 0.15830452427476327, + "flos": 17860283199360.0, + "grad_norm": 2.0984181290998998, + "language_loss": 0.83475512, + "learning_rate": 3.829784322464594e-06, + "loss": 0.86338675, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 2.7946043014526367 + }, + { + "auxiliary_loss_clip": 0.0154852, + "auxiliary_loss_mlp": 0.01299651, + "balance_loss_clip": 1.19065595, + "balance_loss_mlp": 1.05722678, + "epoch": 0.15836464752743123, + "flos": 24537261084480.0, + "grad_norm": 1.787135108720559, + "language_loss": 0.77636081, + "learning_rate": 3.829627062746394e-06, + "loss": 0.80484247, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.851733922958374 + }, + { + "auxiliary_loss_clip": 0.01545286, + "auxiliary_loss_mlp": 0.01317273, + "balance_loss_clip": 1.18677568, + "balance_loss_mlp": 1.08247852, + "epoch": 0.1584247707800992, + "flos": 20122910172000.0, + "grad_norm": 2.7014080519244574, + "language_loss": 0.89395469, + "learning_rate": 3.829469733648552e-06, + "loss": 0.92258024, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 2.782426118850708 + }, + { + "auxiliary_loss_clip": 0.01541704, + "auxiliary_loss_mlp": 0.01326771, + "balance_loss_clip": 1.18306088, + "balance_loss_mlp": 1.09025991, + "epoch": 0.15848489403276717, + "flos": 20378055454560.0, + "grad_norm": 2.441090300079672, + "language_loss": 0.76000464, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78868937, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.7894186973571777 + }, + { + "auxiliary_loss_clip": 0.01546906, + "auxiliary_loss_mlp": 0.01305887, + "balance_loss_clip": 1.18758786, + "balance_loss_mlp": 1.06956625, + "epoch": 0.15854501728543513, + "flos": 39349948461120.0, + "grad_norm": 2.076295275373042, + "language_loss": 0.72022945, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74875736, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.92704176902771 + }, + { + "auxiliary_loss_clip": 0.0155292, + "auxiliary_loss_mlp": 0.01351408, + "balance_loss_clip": 1.19434786, + "balance_loss_mlp": 1.1095562, + "epoch": 0.15860514053810312, + "flos": 24866594576640.0, + "grad_norm": 2.125207617632337, + "language_loss": 0.78080201, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80984533, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.85733699798584 + }, + { + "auxiliary_loss_clip": 0.01542289, + "auxiliary_loss_mlp": 0.01357403, + "balance_loss_clip": 1.18472838, + "balance_loss_mlp": 1.11726737, + "epoch": 0.1586652637907711, + "flos": 26180135729280.0, + "grad_norm": 5.656387782845762, + "language_loss": 0.75631893, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78531587, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 4.3396124839782715 + }, + { + "auxiliary_loss_clip": 0.0154854, + "auxiliary_loss_mlp": 0.01339718, + "balance_loss_clip": 1.19009686, + "balance_loss_mlp": 1.09271646, + "epoch": 0.15872538704343905, + "flos": 19794107674080.0, + "grad_norm": 2.0979593190167103, + "language_loss": 0.81346005, + "learning_rate": 3.82868204767362e-06, + "loss": 0.84234267, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 2.800114154815674 + }, + { + "auxiliary_loss_clip": 0.01537737, + "auxiliary_loss_mlp": 0.01301983, + "balance_loss_clip": 1.18073606, + "balance_loss_mlp": 1.06375492, + "epoch": 0.15878551029610702, + "flos": 28477884542400.0, + "grad_norm": 2.594899496621474, + "language_loss": 0.6717447, + "learning_rate": 3.828524302423306e-06, + "loss": 0.70014191, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.8512489795684814 + }, + { + "auxiliary_loss_clip": 0.0155513, + "auxiliary_loss_mlp": 0.01343187, + "balance_loss_clip": 1.19753003, + "balance_loss_mlp": 1.10324216, + "epoch": 0.15884563354877498, + "flos": 24208989580800.0, + "grad_norm": 2.727276419786142, + "language_loss": 0.75424051, + "learning_rate": 3.828366487835167e-06, + "loss": 0.78322363, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.8120410442352295 + }, + { + "auxiliary_loss_clip": 0.01543705, + "auxiliary_loss_mlp": 0.01319936, + "balance_loss_clip": 1.18601573, + "balance_loss_mlp": 1.08666742, + "epoch": 0.15890575680144295, + "flos": 23951985818400.0, + "grad_norm": 2.3799082869634116, + "language_loss": 0.703076, + "learning_rate": 3.828208603915186e-06, + "loss": 0.73171246, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.805711030960083 + }, + { + "auxiliary_loss_clip": 0.01545688, + "auxiliary_loss_mlp": 0.01319133, + "balance_loss_clip": 1.18759954, + "balance_loss_mlp": 1.08071446, + "epoch": 0.15896588005411091, + "flos": 21217148517600.0, + "grad_norm": 2.3921196743367217, + "language_loss": 0.78984404, + "learning_rate": 3.828050650669353e-06, + "loss": 0.81849223, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.8670051097869873 + }, + { + "auxiliary_loss_clip": 0.01539722, + "auxiliary_loss_mlp": 0.01320671, + "balance_loss_clip": 1.18169999, + "balance_loss_mlp": 1.08168006, + "epoch": 0.1590260033067789, + "flos": 24354673100640.0, + "grad_norm": 2.263476731714113, + "language_loss": 0.82213205, + "learning_rate": 3.827892628103657e-06, + "loss": 0.85073596, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.8244900703430176 + }, + { + "auxiliary_loss_clip": 0.01542318, + "auxiliary_loss_mlp": 0.01311854, + "balance_loss_clip": 1.18524837, + "balance_loss_mlp": 1.07000244, + "epoch": 0.15908612655944687, + "flos": 32051549409120.0, + "grad_norm": 2.6660141624404274, + "language_loss": 0.70344675, + "learning_rate": 3.827734536224087e-06, + "loss": 0.73198843, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 2.889277935028076 + }, + { + "auxiliary_loss_clip": 0.0155107, + "auxiliary_loss_mlp": 0.01326451, + "balance_loss_clip": 1.19454193, + "balance_loss_mlp": 1.09070325, + "epoch": 0.15914624981211484, + "flos": 17787308690880.0, + "grad_norm": 3.1735771198930927, + "language_loss": 0.62906438, + "learning_rate": 3.827576375036642e-06, + "loss": 0.65783954, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 4.260493516921997 + }, + { + "auxiliary_loss_clip": 0.01549357, + "auxiliary_loss_mlp": 0.01312381, + "balance_loss_clip": 1.19222677, + "balance_loss_mlp": 1.07243633, + "epoch": 0.1592063730647828, + "flos": 17714447966880.0, + "grad_norm": 2.317716409731893, + "language_loss": 0.8911947, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91981208, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 4.3716254234313965 + }, + { + "auxiliary_loss_clip": 0.01547315, + "auxiliary_loss_mlp": 0.01290719, + "balance_loss_clip": 1.18935728, + "balance_loss_mlp": 1.05153775, + "epoch": 0.15926649631745077, + "flos": 18805348634400.0, + "grad_norm": 2.6528825960195426, + "language_loss": 0.91553897, + "learning_rate": 3.827259844762114e-06, + "loss": 0.94391924, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 4.222329378128052 + }, + { + "auxiliary_loss_clip": 0.01543527, + "auxiliary_loss_mlp": 0.01327406, + "balance_loss_clip": 1.18719578, + "balance_loss_mlp": 1.08402801, + "epoch": 0.15932661957011873, + "flos": 17568157596480.0, + "grad_norm": 3.436752386176227, + "language_loss": 0.71823001, + "learning_rate": 3.827101475687033e-06, + "loss": 0.74693936, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.8227362632751465 + }, + { + "auxiliary_loss_clip": 0.01548146, + "auxiliary_loss_mlp": 0.0129422, + "balance_loss_clip": 1.19045806, + "balance_loss_mlp": 1.05961585, + "epoch": 0.15938674282278673, + "flos": 13336053314400.0, + "grad_norm": 4.142357687654957, + "language_loss": 0.71257102, + "learning_rate": 3.826943037328082e-06, + "loss": 0.74099469, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.7829365730285645 + }, + { + "auxiliary_loss_clip": 0.01545692, + "auxiliary_loss_mlp": 0.01301126, + "balance_loss_clip": 1.18888164, + "balance_loss_mlp": 1.05927396, + "epoch": 0.1594468660754547, + "flos": 22490864737920.0, + "grad_norm": 2.1051315186829993, + "language_loss": 0.79834783, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82681602, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.9067018032073975 + }, + { + "auxiliary_loss_clip": 0.01551008, + "auxiliary_loss_mlp": 0.01337602, + "balance_loss_clip": 1.19277275, + "balance_loss_mlp": 1.10089993, + "epoch": 0.15950698932812266, + "flos": 15008815713600.0, + "grad_norm": 3.7048418868878823, + "language_loss": 0.69741488, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72630101, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.7921576499938965 + }, + { + "auxiliary_loss_clip": 0.0154981, + "auxiliary_loss_mlp": 0.01339, + "balance_loss_clip": 1.19282639, + "balance_loss_mlp": 1.10210764, + "epoch": 0.15956711258079062, + "flos": 30157739507520.0, + "grad_norm": 2.8160866071998263, + "language_loss": 0.77175188, + "learning_rate": 3.826467306608095e-06, + "loss": 0.80063999, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.911956310272217 + }, + { + "auxiliary_loss_clip": 0.01536532, + "auxiliary_loss_mlp": 0.01294242, + "balance_loss_clip": 1.17952597, + "balance_loss_mlp": 1.05219996, + "epoch": 0.1596272358334586, + "flos": 21034826030880.0, + "grad_norm": 1.9335356707411344, + "language_loss": 0.82035345, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84866118, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.7861311435699463 + }, + { + "auxiliary_loss_clip": 0.01543133, + "auxiliary_loss_mlp": 0.01384496, + "balance_loss_clip": 1.1866641, + "balance_loss_mlp": 1.14035547, + "epoch": 0.15968735908612655, + "flos": 15269839860960.0, + "grad_norm": 2.2164967626582284, + "language_loss": 0.73627913, + "learning_rate": 3.826149806485631e-06, + "loss": 0.76555538, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.762221336364746 + }, + { + "auxiliary_loss_clip": 0.01544439, + "auxiliary_loss_mlp": 0.0139657, + "balance_loss_clip": 1.18715882, + "balance_loss_mlp": 1.15204787, + "epoch": 0.15974748233879452, + "flos": 52669616810400.0, + "grad_norm": 2.670988952903009, + "language_loss": 0.77865601, + "learning_rate": 3.825990952549713e-06, + "loss": 0.80806607, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 3.09726619720459 + }, + { + "auxiliary_loss_clip": 0.01546018, + "auxiliary_loss_mlp": 0.01351191, + "balance_loss_clip": 1.18890762, + "balance_loss_mlp": 1.10438001, + "epoch": 0.1598076055914625, + "flos": 18735180809760.0, + "grad_norm": 1.7015029226465819, + "language_loss": 0.74693918, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77591133, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 2.716003656387329 + }, + { + "auxiliary_loss_clip": 0.01544596, + "auxiliary_loss_mlp": 0.01329407, + "balance_loss_clip": 1.18620539, + "balance_loss_mlp": 1.0902257, + "epoch": 0.15986772884413047, + "flos": 34352066977920.0, + "grad_norm": 1.719250062477336, + "language_loss": 0.75244778, + "learning_rate": 3.825673036958624e-06, + "loss": 0.78118777, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 2.9509599208831787 + }, + { + "auxiliary_loss_clip": 0.01545863, + "auxiliary_loss_mlp": 0.01354263, + "balance_loss_clip": 1.18780088, + "balance_loss_mlp": 1.11450958, + "epoch": 0.15992785209679844, + "flos": 22057189784640.0, + "grad_norm": 2.652699676804116, + "language_loss": 0.9075774, + "learning_rate": 3.825513975315508e-06, + "loss": 0.93657875, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.757946014404297 + }, + { + "auxiliary_loss_clip": 0.0154569, + "auxiliary_loss_mlp": 0.01348488, + "balance_loss_clip": 1.18838024, + "balance_loss_mlp": 1.10873413, + "epoch": 0.1599879753494664, + "flos": 33069096286560.0, + "grad_norm": 2.1181611626725854, + "language_loss": 0.78260064, + "learning_rate": 3.82535484444872e-06, + "loss": 0.81154245, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 2.8646206855773926 + }, + { + "auxiliary_loss_clip": 0.0153692, + "auxiliary_loss_mlp": 0.0132023, + "balance_loss_clip": 1.17978966, + "balance_loss_mlp": 1.07532656, + "epoch": 0.16004809860213437, + "flos": 28040530557600.0, + "grad_norm": 1.9607602092090246, + "language_loss": 0.74506867, + "learning_rate": 3.825195644364292e-06, + "loss": 0.77364016, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.8224761486053467 + }, + { + "auxiliary_loss_clip": 0.01543136, + "auxiliary_loss_mlp": 0.01348429, + "balance_loss_clip": 1.18501198, + "balance_loss_mlp": 1.11287189, + "epoch": 0.16010822185480234, + "flos": 22782117993120.0, + "grad_norm": 2.353555135861546, + "language_loss": 0.82282346, + "learning_rate": 3.825036375068263e-06, + "loss": 0.85173917, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.893378257751465 + }, + { + "auxiliary_loss_clip": 0.01549056, + "auxiliary_loss_mlp": 0.01330675, + "balance_loss_clip": 1.19058025, + "balance_loss_mlp": 1.096071, + "epoch": 0.16016834510747033, + "flos": 20086081564320.0, + "grad_norm": 2.240365291408016, + "language_loss": 0.7991569, + "learning_rate": 3.824877036566672e-06, + "loss": 0.82795417, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 2.8166747093200684 + }, + { + "auxiliary_loss_clip": 0.01540936, + "auxiliary_loss_mlp": 0.0131908, + "balance_loss_clip": 1.1835978, + "balance_loss_mlp": 1.08619308, + "epoch": 0.1602284683601383, + "flos": 21175692674400.0, + "grad_norm": 1.9362891159631601, + "language_loss": 0.94080639, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96940655, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 2.9062650203704834 + }, + { + "auxiliary_loss_clip": 0.01544404, + "auxiliary_loss_mlp": 0.01347368, + "balance_loss_clip": 1.18617988, + "balance_loss_mlp": 1.10608804, + "epoch": 0.16028859161280626, + "flos": 14649328969920.0, + "grad_norm": 2.251649004675273, + "language_loss": 0.85279876, + "learning_rate": 3.824558151970974e-06, + "loss": 0.88171649, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 2.7896170616149902 + }, + { + "auxiliary_loss_clip": 0.01538036, + "auxiliary_loss_mlp": 0.01334255, + "balance_loss_clip": 1.1804527, + "balance_loss_mlp": 1.09411955, + "epoch": 0.16034871486547422, + "flos": 20992118558400.0, + "grad_norm": 2.8134716887498645, + "language_loss": 0.81532145, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.84404445, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 2.7467198371887207 + }, + { + "auxiliary_loss_clip": 0.01548188, + "auxiliary_loss_mlp": 0.01326095, + "balance_loss_clip": 1.18859661, + "balance_loss_mlp": 1.08920193, + "epoch": 0.1604088381181422, + "flos": 21399888214080.0, + "grad_norm": 2.251226949539064, + "language_loss": 0.74066138, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76940423, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 2.8244776725769043 + }, + { + "auxiliary_loss_clip": 0.01537497, + "auxiliary_loss_mlp": 0.01302108, + "balance_loss_clip": 1.17991173, + "balance_loss_mlp": 1.06101918, + "epoch": 0.16046896137081015, + "flos": 23879238878880.0, + "grad_norm": 2.160297172931961, + "language_loss": 0.77523118, + "learning_rate": 3.824079306186848e-06, + "loss": 0.80362725, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 2.8895492553710938 + }, + { + "auxiliary_loss_clip": 0.0162128, + "auxiliary_loss_mlp": 0.01249397, + "balance_loss_clip": 1.26432371, + "balance_loss_mlp": 1.05561066, + "epoch": 0.16052908462347812, + "flos": 59812470656640.0, + "grad_norm": 0.8092184240847633, + "language_loss": 0.55506361, + "learning_rate": 3.823919552578861e-06, + "loss": 0.58377039, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 3.238588571548462 + }, + { + "auxiliary_loss_clip": 0.01530123, + "auxiliary_loss_mlp": 0.01397232, + "balance_loss_clip": 1.17178512, + "balance_loss_mlp": 1.16205597, + "epoch": 0.1605892078761461, + "flos": 18298547460000.0, + "grad_norm": 2.223568191667572, + "language_loss": 0.77602583, + "learning_rate": 3.82375972980766e-06, + "loss": 0.8052994, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 2.7614760398864746 + }, + { + "auxiliary_loss_clip": 0.01545575, + "auxiliary_loss_mlp": 0.01497222, + "balance_loss_clip": 1.18595278, + "balance_loss_mlp": 1.27654207, + "epoch": 0.16064933112881408, + "flos": 32163476502240.0, + "grad_norm": 2.6420827929421793, + "language_loss": 0.64928317, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.67971122, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.885788679122925 + }, + { + "auxiliary_loss_clip": 0.01535479, + "auxiliary_loss_mlp": 0.01495594, + "balance_loss_clip": 1.17669261, + "balance_loss_mlp": 1.27033615, + "epoch": 0.16070945438148204, + "flos": 19830860425440.0, + "grad_norm": 11.040402030676969, + "language_loss": 0.85955483, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.88986552, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.877924680709839 + }, + { + "auxiliary_loss_clip": 0.01546642, + "auxiliary_loss_mlp": 0.01531686, + "balance_loss_clip": 1.18749189, + "balance_loss_mlp": 1.31157827, + "epoch": 0.16076957763415, + "flos": 18914962109760.0, + "grad_norm": 3.362056348146592, + "language_loss": 0.72509414, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75587738, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 2.8241491317749023 + }, + { + "auxiliary_loss_clip": 0.01535514, + "auxiliary_loss_mlp": 0.01500391, + "balance_loss_clip": 1.17699623, + "balance_loss_mlp": 1.27990127, + "epoch": 0.16082970088681797, + "flos": 16766272422720.0, + "grad_norm": 2.6957057854612896, + "language_loss": 0.84278095, + "learning_rate": 3.823119747211986e-06, + "loss": 0.87313998, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.866776466369629 + }, + { + "auxiliary_loss_clip": 0.01545582, + "auxiliary_loss_mlp": 0.0151319, + "balance_loss_clip": 1.1855737, + "balance_loss_mlp": 1.29403532, + "epoch": 0.16088982413948594, + "flos": 35153041875840.0, + "grad_norm": 2.163592209243569, + "language_loss": 0.82751691, + "learning_rate": 3.822959578715685e-06, + "loss": 0.85810459, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.8824429512023926 + }, + { + "auxiliary_loss_clip": 0.01537011, + "auxiliary_loss_mlp": 0.01472689, + "balance_loss_clip": 1.17686343, + "balance_loss_mlp": 1.24342608, + "epoch": 0.1609499473921539, + "flos": 18627122388960.0, + "grad_norm": 2.0665316294520473, + "language_loss": 0.73535514, + "learning_rate": 3.822799341092573e-06, + "loss": 0.76545209, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.8343162536621094 + }, + { + "auxiliary_loss_clip": 0.01533061, + "auxiliary_loss_mlp": 0.01431886, + "balance_loss_clip": 1.17375863, + "balance_loss_mlp": 1.20452988, + "epoch": 0.1610100706448219, + "flos": 33148518582240.0, + "grad_norm": 2.134360608982183, + "language_loss": 0.76410484, + "learning_rate": 3.822639034348728e-06, + "loss": 0.79375434, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 4.40893816947937 + }, + { + "auxiliary_loss_clip": 0.0153889, + "auxiliary_loss_mlp": 0.0138713, + "balance_loss_clip": 1.1797632, + "balance_loss_mlp": 1.15958309, + "epoch": 0.16107019389748986, + "flos": 34679504062080.0, + "grad_norm": 3.292565119063174, + "language_loss": 0.7061159, + "learning_rate": 3.822478658490228e-06, + "loss": 0.73537606, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.8966920375823975 + }, + { + "auxiliary_loss_clip": 0.016156, + "auxiliary_loss_mlp": 0.01247124, + "balance_loss_clip": 1.25798631, + "balance_loss_mlp": 1.06401825, + "epoch": 0.16113031715015783, + "flos": 65719271674080.0, + "grad_norm": 0.8516708459882047, + "language_loss": 0.5182364, + "learning_rate": 3.822318213523154e-06, + "loss": 0.54686362, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 3.331874132156372 + }, + { + "auxiliary_loss_clip": 0.01540779, + "auxiliary_loss_mlp": 0.01411404, + "balance_loss_clip": 1.18382955, + "balance_loss_mlp": 1.17336655, + "epoch": 0.1611904404028258, + "flos": 20812337258400.0, + "grad_norm": 1.8504723466647885, + "language_loss": 0.80563676, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.83515853, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 2.843944787979126 + }, + { + "auxiliary_loss_clip": 0.01545143, + "auxiliary_loss_mlp": 0.01459101, + "balance_loss_clip": 1.18805552, + "balance_loss_mlp": 1.21267128, + "epoch": 0.16125056365549376, + "flos": 27015625617120.0, + "grad_norm": 2.56369114895921, + "language_loss": 0.6914221, + "learning_rate": 3.821997116287627e-06, + "loss": 0.72146457, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.835322380065918 + }, + { + "auxiliary_loss_clip": 0.01549408, + "auxiliary_loss_mlp": 0.01457998, + "balance_loss_clip": 1.1912694, + "balance_loss_mlp": 1.21423841, + "epoch": 0.16131068690816172, + "flos": 19278279597600.0, + "grad_norm": 2.1240203825990798, + "language_loss": 0.87672961, + "learning_rate": 3.821836464031348e-06, + "loss": 0.90680361, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.758185625076294 + }, + { + "auxiliary_loss_clip": 0.01543423, + "auxiliary_loss_mlp": 0.01405678, + "balance_loss_clip": 1.18640232, + "balance_loss_mlp": 1.16191912, + "epoch": 0.16137081016082971, + "flos": 35341015658400.0, + "grad_norm": 2.1054677649489437, + "language_loss": 0.74260759, + "learning_rate": 3.821675742690849e-06, + "loss": 0.7720986, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.954458236694336 + }, + { + "auxiliary_loss_clip": 0.01536932, + "auxiliary_loss_mlp": 0.0132159, + "balance_loss_clip": 1.17961156, + "balance_loss_mlp": 1.08202696, + "epoch": 0.16143093341349768, + "flos": 34237826267040.0, + "grad_norm": 1.8556745950111768, + "language_loss": 0.70289576, + "learning_rate": 3.821514952272223e-06, + "loss": 0.73148102, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 4.323825359344482 + }, + { + "auxiliary_loss_clip": 0.0155031, + "auxiliary_loss_mlp": 0.01380321, + "balance_loss_clip": 1.19168055, + "balance_loss_mlp": 1.15372813, + "epoch": 0.16149105666616564, + "flos": 28001653829280.0, + "grad_norm": 6.750559835406333, + "language_loss": 0.72233713, + "learning_rate": 3.821354092781567e-06, + "loss": 0.75164348, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 4.346671104431152 + }, + { + "auxiliary_loss_clip": 0.01549628, + "auxiliary_loss_mlp": 0.01429651, + "balance_loss_clip": 1.19296598, + "balance_loss_mlp": 1.20324922, + "epoch": 0.1615511799188336, + "flos": 19423963117440.0, + "grad_norm": 1.9389989687750813, + "language_loss": 0.81879544, + "learning_rate": 3.821193164224981e-06, + "loss": 0.84858823, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 4.317488193511963 + }, + { + "auxiliary_loss_clip": 0.01537096, + "auxiliary_loss_mlp": 0.01482462, + "balance_loss_clip": 1.17954373, + "balance_loss_mlp": 1.25586915, + "epoch": 0.16161130317150157, + "flos": 22857026837760.0, + "grad_norm": 3.6329388057533376, + "language_loss": 0.71935534, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74955094, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 2.763589382171631 + }, + { + "auxiliary_loss_clip": 0.01540006, + "auxiliary_loss_mlp": 0.01491854, + "balance_loss_clip": 1.18373299, + "balance_loss_mlp": 1.26659632, + "epoch": 0.16167142642416954, + "flos": 26113343510880.0, + "grad_norm": 1.7461749161668916, + "language_loss": 0.76027012, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.79058874, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.796144485473633 + }, + { + "auxiliary_loss_clip": 0.01549888, + "auxiliary_loss_mlp": 0.01499199, + "balance_loss_clip": 1.19164538, + "balance_loss_mlp": 1.28004503, + "epoch": 0.1617315496768375, + "flos": 22781624927040.0, + "grad_norm": 1.820633042134608, + "language_loss": 0.87630194, + "learning_rate": 3.820709964220683e-06, + "loss": 0.90679282, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.823218584060669 + }, + { + "auxiliary_loss_clip": 0.01547806, + "auxiliary_loss_mlp": 0.0149137, + "balance_loss_clip": 1.19045174, + "balance_loss_mlp": 1.26878262, + "epoch": 0.1617916729295055, + "flos": 22019450901120.0, + "grad_norm": 1.7463909138309865, + "language_loss": 0.8822937, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.91268539, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.757324457168579 + }, + { + "auxiliary_loss_clip": 0.01540975, + "auxiliary_loss_mlp": 0.01496941, + "balance_loss_clip": 1.18255973, + "balance_loss_mlp": 1.27053857, + "epoch": 0.16185179618217346, + "flos": 23440367767680.0, + "grad_norm": 3.238353002448533, + "language_loss": 0.82404649, + "learning_rate": 3.820387485666784e-06, + "loss": 0.85442567, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.7993409633636475 + }, + { + "auxiliary_loss_clip": 0.01540879, + "auxiliary_loss_mlp": 0.01482375, + "balance_loss_clip": 1.18409216, + "balance_loss_mlp": 1.25158548, + "epoch": 0.16191191943484143, + "flos": 25668479750400.0, + "grad_norm": 2.356282305491148, + "language_loss": 0.81803381, + "learning_rate": 3.820226142842862e-06, + "loss": 0.84826636, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.7852227687835693 + }, + { + "auxiliary_loss_clip": 0.01542988, + "auxiliary_loss_mlp": 0.01425117, + "balance_loss_clip": 1.18616343, + "balance_loss_mlp": 1.1992867, + "epoch": 0.1619720426875094, + "flos": 23479623777600.0, + "grad_norm": 1.6832372840334076, + "language_loss": 0.84158874, + "learning_rate": 3.820064730995783e-06, + "loss": 0.87126982, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.796807050704956 + }, + { + "auxiliary_loss_clip": 0.01541692, + "auxiliary_loss_mlp": 0.01388528, + "balance_loss_clip": 1.18412781, + "balance_loss_mlp": 1.15888309, + "epoch": 0.16203216594017736, + "flos": 24135939216000.0, + "grad_norm": 2.321074711015489, + "language_loss": 0.6970681, + "learning_rate": 3.819903250131667e-06, + "loss": 0.72637028, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.7753238677978516 + }, + { + "auxiliary_loss_clip": 0.01550425, + "auxiliary_loss_mlp": 0.01325312, + "balance_loss_clip": 1.19308329, + "balance_loss_mlp": 1.09089899, + "epoch": 0.16209228919284532, + "flos": 22342791744000.0, + "grad_norm": 2.3346304708818737, + "language_loss": 0.82714075, + "learning_rate": 3.819741700256637e-06, + "loss": 0.85589814, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.767781972885132 + }, + { + "auxiliary_loss_clip": 0.01541668, + "auxiliary_loss_mlp": 0.01377528, + "balance_loss_clip": 1.18498635, + "balance_loss_mlp": 1.13090754, + "epoch": 0.1621524124455133, + "flos": 15816959033760.0, + "grad_norm": 3.71351118952421, + "language_loss": 0.88850456, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.91769654, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 2.815703868865967 + }, + { + "auxiliary_loss_clip": 0.01549033, + "auxiliary_loss_mlp": 0.0137705, + "balance_loss_clip": 1.19086289, + "balance_loss_mlp": 1.13538933, + "epoch": 0.16221253569818128, + "flos": 30189106460160.0, + "grad_norm": 1.6600435231845918, + "language_loss": 0.80956346, + "learning_rate": 3.819418393498343e-06, + "loss": 0.83882427, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 2.8590471744537354 + }, + { + "auxiliary_loss_clip": 0.0154938, + "auxiliary_loss_mlp": 0.01362507, + "balance_loss_clip": 1.1934762, + "balance_loss_mlp": 1.11836672, + "epoch": 0.16227265895084925, + "flos": 24608187472320.0, + "grad_norm": 1.7117073432945218, + "language_loss": 0.77302456, + "learning_rate": 3.819256636627339e-06, + "loss": 0.80214345, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 2.8023123741149902 + }, + { + "auxiliary_loss_clip": 0.01550618, + "auxiliary_loss_mlp": 0.01322579, + "balance_loss_clip": 1.19438148, + "balance_loss_mlp": 1.08320713, + "epoch": 0.1623327822035172, + "flos": 19575373789440.0, + "grad_norm": 2.063289323966601, + "language_loss": 0.8568604, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88559234, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.778228998184204 + }, + { + "auxiliary_loss_clip": 0.01547236, + "auxiliary_loss_mlp": 0.01336076, + "balance_loss_clip": 1.19089174, + "balance_loss_mlp": 1.09765697, + "epoch": 0.16239290545618518, + "flos": 26470857990240.0, + "grad_norm": 2.023406250693485, + "language_loss": 0.80992758, + "learning_rate": 3.818932915932284e-06, + "loss": 0.83876067, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.8442039489746094 + }, + { + "auxiliary_loss_clip": 0.01547503, + "auxiliary_loss_mlp": 0.01362243, + "balance_loss_clip": 1.19137311, + "balance_loss_mlp": 1.12267995, + "epoch": 0.16245302870885314, + "flos": 15853787641440.0, + "grad_norm": 2.2664138326339067, + "language_loss": 0.73152864, + "learning_rate": 3.818770952120511e-06, + "loss": 0.7606262, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.7896640300750732 + }, + { + "auxiliary_loss_clip": 0.01552646, + "auxiliary_loss_mlp": 0.01371256, + "balance_loss_clip": 1.19510639, + "balance_loss_mlp": 1.13646173, + "epoch": 0.1625131519615211, + "flos": 14758259738400.0, + "grad_norm": 2.720647011756592, + "language_loss": 0.73082387, + "learning_rate": 3.81860891934076e-06, + "loss": 0.76006293, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 2.76405668258667 + }, + { + "auxiliary_loss_clip": 0.01547764, + "auxiliary_loss_mlp": 0.01380354, + "balance_loss_clip": 1.19126439, + "balance_loss_mlp": 1.14403319, + "epoch": 0.1625732752141891, + "flos": 28223042685120.0, + "grad_norm": 2.0558313851426164, + "language_loss": 0.70868742, + "learning_rate": 3.818446817599176e-06, + "loss": 0.73796868, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 2.853870391845703 + }, + { + "auxiliary_loss_clip": 0.01665654, + "auxiliary_loss_mlp": 0.01272682, + "balance_loss_clip": 1.31256127, + "balance_loss_mlp": 1.08156586, + "epoch": 0.16263339846685707, + "flos": 67334420469600.0, + "grad_norm": 0.7799818729140883, + "language_loss": 0.53366023, + "learning_rate": 3.818284646901907e-06, + "loss": 0.56304359, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 3.336655616760254 + }, + { + "auxiliary_loss_clip": 0.01550679, + "auxiliary_loss_mlp": 0.0134127, + "balance_loss_clip": 1.19461966, + "balance_loss_mlp": 1.09827352, + "epoch": 0.16269352171952503, + "flos": 14320905753600.0, + "grad_norm": 2.565019152001008, + "language_loss": 0.75897789, + "learning_rate": 3.818122407255102e-06, + "loss": 0.78789741, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 2.7717125415802 + }, + { + "auxiliary_loss_clip": 0.01550426, + "auxiliary_loss_mlp": 0.01443206, + "balance_loss_clip": 1.19275761, + "balance_loss_mlp": 1.20192647, + "epoch": 0.162753644972193, + "flos": 28363454190720.0, + "grad_norm": 4.000866103523967, + "language_loss": 0.7218163, + "learning_rate": 3.817960098664914e-06, + "loss": 0.75175261, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 2.8363800048828125 + }, + { + "auxiliary_loss_clip": 0.0155719, + "auxiliary_loss_mlp": 0.01499328, + "balance_loss_clip": 1.20062721, + "balance_loss_mlp": 1.25137281, + "epoch": 0.16281376822486096, + "flos": 19939904978400.0, + "grad_norm": 2.559886472044134, + "language_loss": 0.83242208, + "learning_rate": 3.817797721137495e-06, + "loss": 0.86298728, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.7975943088531494 + }, + { + "auxiliary_loss_clip": 0.01545523, + "auxiliary_loss_mlp": 0.01510882, + "balance_loss_clip": 1.18911719, + "balance_loss_mlp": 1.25968456, + "epoch": 0.16287389147752893, + "flos": 21253863340800.0, + "grad_norm": 2.8160035518290405, + "language_loss": 0.8626495, + "learning_rate": 3.817635274679006e-06, + "loss": 0.89321351, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 2.845109701156616 + }, + { + "auxiliary_loss_clip": 0.01542286, + "auxiliary_loss_mlp": 0.01487898, + "balance_loss_clip": 1.18662763, + "balance_loss_mlp": 1.23784494, + "epoch": 0.1629340147301969, + "flos": 19246685076000.0, + "grad_norm": 2.1025474577031447, + "language_loss": 0.91638374, + "learning_rate": 3.817472759295605e-06, + "loss": 0.94668561, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.8979685306549072 + }, + { + "auxiliary_loss_clip": 0.015547, + "auxiliary_loss_mlp": 0.01449184, + "balance_loss_clip": 1.19890666, + "balance_loss_mlp": 1.20618784, + "epoch": 0.16299413798286488, + "flos": 21251853148320.0, + "grad_norm": 2.834061252357016, + "language_loss": 0.81604886, + "learning_rate": 3.817310174993453e-06, + "loss": 0.84608775, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 2.851921319961548 + }, + { + "auxiliary_loss_clip": 0.01540241, + "auxiliary_loss_mlp": 0.0136748, + "balance_loss_clip": 1.18403625, + "balance_loss_mlp": 1.12620044, + "epoch": 0.16305426123553285, + "flos": 18772578339840.0, + "grad_norm": 5.599518436300733, + "language_loss": 0.81195498, + "learning_rate": 3.817147521778719e-06, + "loss": 0.84103221, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.935295581817627 + }, + { + "auxiliary_loss_clip": 0.01551104, + "auxiliary_loss_mlp": 0.01351238, + "balance_loss_clip": 1.19490802, + "balance_loss_mlp": 1.11796975, + "epoch": 0.16311438448820081, + "flos": 22089656653920.0, + "grad_norm": 1.8352992593027286, + "language_loss": 0.7662791, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79530251, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.846550703048706 + }, + { + "auxiliary_loss_clip": 0.0156639, + "auxiliary_loss_mlp": 0.01412858, + "balance_loss_clip": 1.20871997, + "balance_loss_mlp": 1.18969774, + "epoch": 0.16317450774086878, + "flos": 16469329943520.0, + "grad_norm": 2.9686444431810672, + "language_loss": 0.7946353, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.82442772, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 2.7387075424194336 + }, + { + "auxiliary_loss_clip": 0.0154594, + "auxiliary_loss_mlp": 0.01442587, + "balance_loss_clip": 1.1905688, + "balance_loss_mlp": 1.21446824, + "epoch": 0.16323463099353674, + "flos": 24355242023040.0, + "grad_norm": 1.642770922491389, + "language_loss": 0.78093034, + "learning_rate": 3.816659148720702e-06, + "loss": 0.81081557, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.8678812980651855 + }, + { + "auxiliary_loss_clip": 0.01553124, + "auxiliary_loss_mlp": 0.01456368, + "balance_loss_clip": 1.19754696, + "balance_loss_mlp": 1.23168182, + "epoch": 0.1632947542462047, + "flos": 24903157687200.0, + "grad_norm": 2.1181917339162717, + "language_loss": 0.81908935, + "learning_rate": 3.816496219917336e-06, + "loss": 0.84918433, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 4.37394905090332 + }, + { + "auxiliary_loss_clip": 0.01554985, + "auxiliary_loss_mlp": 0.01478456, + "balance_loss_clip": 1.19969916, + "balance_loss_mlp": 1.25319815, + "epoch": 0.1633548774988727, + "flos": 24902626692960.0, + "grad_norm": 2.667207004536488, + "language_loss": 0.86364317, + "learning_rate": 3.816333222232251e-06, + "loss": 0.89397764, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 2.8351571559906006 + }, + { + "auxiliary_loss_clip": 0.01557102, + "auxiliary_loss_mlp": 0.01459358, + "balance_loss_clip": 1.20295501, + "balance_loss_mlp": 1.23410058, + "epoch": 0.16341500075154067, + "flos": 30444213814560.0, + "grad_norm": 1.8714547490429776, + "language_loss": 0.7616086, + "learning_rate": 3.816170155671629e-06, + "loss": 0.7917732, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.9202888011932373 + }, + { + "auxiliary_loss_clip": 0.01544674, + "auxiliary_loss_mlp": 0.01429692, + "balance_loss_clip": 1.19021618, + "balance_loss_mlp": 1.19527876, + "epoch": 0.16347512400420863, + "flos": 22786783156800.0, + "grad_norm": 2.0604530501682303, + "language_loss": 0.7375102, + "learning_rate": 3.816007020241652e-06, + "loss": 0.76725382, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.8569018840789795 + }, + { + "auxiliary_loss_clip": 0.01551073, + "auxiliary_loss_mlp": 0.0140354, + "balance_loss_clip": 1.1979785, + "balance_loss_mlp": 1.16626573, + "epoch": 0.1635352472568766, + "flos": 22635220772160.0, + "grad_norm": 1.6861326034302324, + "language_loss": 0.72759557, + "learning_rate": 3.815843815948507e-06, + "loss": 0.75714171, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 2.8430418968200684 + }, + { + "auxiliary_loss_clip": 0.01546347, + "auxiliary_loss_mlp": 0.01348833, + "balance_loss_clip": 1.19221592, + "balance_loss_mlp": 1.11766207, + "epoch": 0.16359537050954456, + "flos": 15524795502720.0, + "grad_norm": 2.585812876539355, + "language_loss": 0.75548935, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.78444111, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 3.017469882965088 + }, + { + "auxiliary_loss_clip": 0.01545011, + "auxiliary_loss_mlp": 0.01351408, + "balance_loss_clip": 1.19186711, + "balance_loss_mlp": 1.10860252, + "epoch": 0.16365549376221253, + "flos": 22092311625120.0, + "grad_norm": 1.814437134711272, + "language_loss": 0.79725003, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.82621419, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.7848174571990967 + }, + { + "auxiliary_loss_clip": 0.01539908, + "auxiliary_loss_mlp": 0.01370418, + "balance_loss_clip": 1.18732262, + "balance_loss_mlp": 1.12189054, + "epoch": 0.1637156170148805, + "flos": 24062661282240.0, + "grad_norm": 2.1919019083288807, + "language_loss": 0.8514908, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.88059402, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 4.310307025909424 + }, + { + "auxiliary_loss_clip": 0.01548192, + "auxiliary_loss_mlp": 0.01379097, + "balance_loss_clip": 1.19596815, + "balance_loss_mlp": 1.13476562, + "epoch": 0.1637757402675485, + "flos": 26687657538720.0, + "grad_norm": 1.9373761849150501, + "language_loss": 0.71157181, + "learning_rate": 3.815190310268058e-06, + "loss": 0.74084473, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 4.412710428237915 + }, + { + "auxiliary_loss_clip": 0.01550348, + "auxiliary_loss_mlp": 0.01358554, + "balance_loss_clip": 1.19716907, + "balance_loss_mlp": 1.11536717, + "epoch": 0.16383586352021645, + "flos": 16108781211360.0, + "grad_norm": 2.2400429930536894, + "language_loss": 0.70923966, + "learning_rate": 3.815026761751955e-06, + "loss": 0.7383287, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 2.912553548812866 + }, + { + "auxiliary_loss_clip": 0.0154824, + "auxiliary_loss_mlp": 0.01293642, + "balance_loss_clip": 1.19408631, + "balance_loss_mlp": 1.0569396, + "epoch": 0.16389598677288442, + "flos": 19167679990080.0, + "grad_norm": 6.848104426967188, + "language_loss": 0.88746202, + "learning_rate": 3.814863144409855e-06, + "loss": 0.91588086, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 4.389035940170288 + }, + { + "auxiliary_loss_clip": 0.01552197, + "auxiliary_loss_mlp": 0.01341373, + "balance_loss_clip": 1.20047128, + "balance_loss_mlp": 1.09894907, + "epoch": 0.16395611002555238, + "flos": 21509160336000.0, + "grad_norm": 2.0329197847507574, + "language_loss": 0.74100137, + "learning_rate": 3.814699458247963e-06, + "loss": 0.7699371, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.808809995651245 + }, + { + "auxiliary_loss_clip": 0.01553214, + "auxiliary_loss_mlp": 0.01373956, + "balance_loss_clip": 1.202106, + "balance_loss_mlp": 1.13572848, + "epoch": 0.16401623327822035, + "flos": 21473090291520.0, + "grad_norm": 1.7442258728313274, + "language_loss": 0.82758528, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.856857, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 2.858532190322876 + }, + { + "auxiliary_loss_clip": 0.01549612, + "auxiliary_loss_mlp": 0.01364693, + "balance_loss_clip": 1.19780314, + "balance_loss_mlp": 1.1249398, + "epoch": 0.1640763565308883, + "flos": 13627951348320.0, + "grad_norm": 3.013428181711631, + "language_loss": 0.85205114, + "learning_rate": 3.814371879489633e-06, + "loss": 0.88119423, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.7578699588775635 + }, + { + "auxiliary_loss_clip": 0.01549378, + "auxiliary_loss_mlp": 0.013497, + "balance_loss_clip": 1.19772649, + "balance_loss_mlp": 1.1082294, + "epoch": 0.16413647978355628, + "flos": 15453338120640.0, + "grad_norm": 3.627878142801492, + "language_loss": 0.7306658, + "learning_rate": 3.814207986905616e-06, + "loss": 0.75965655, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.787288188934326 + }, + { + "auxiliary_loss_clip": 0.0154737, + "auxiliary_loss_mlp": 0.01319485, + "balance_loss_clip": 1.19661045, + "balance_loss_mlp": 1.06962204, + "epoch": 0.16419660303622427, + "flos": 45882153102240.0, + "grad_norm": 1.6246484582482557, + "language_loss": 0.74503577, + "learning_rate": 3.814044025526651e-06, + "loss": 0.77370435, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.9784083366394043 + }, + { + "auxiliary_loss_clip": 0.01550207, + "auxiliary_loss_mlp": 0.01349041, + "balance_loss_clip": 1.19824791, + "balance_loss_mlp": 1.09955955, + "epoch": 0.16425672628889224, + "flos": 18954824970240.0, + "grad_norm": 2.188181945883187, + "language_loss": 0.79357618, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.82256866, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.816164493560791 + }, + { + "auxiliary_loss_clip": 0.01553388, + "auxiliary_loss_mlp": 0.0135776, + "balance_loss_clip": 1.20113838, + "balance_loss_mlp": 1.10637164, + "epoch": 0.1643168495415602, + "flos": 24315113665440.0, + "grad_norm": 1.9856825769692767, + "language_loss": 0.6953975, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.72450894, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 2.786944627761841 + }, + { + "auxiliary_loss_clip": 0.01546135, + "auxiliary_loss_mlp": 0.01340467, + "balance_loss_clip": 1.19379497, + "balance_loss_mlp": 1.09575438, + "epoch": 0.16437697279422817, + "flos": 26430615848160.0, + "grad_norm": 4.845852492762271, + "language_loss": 0.81462312, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.84348917, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 2.8252594470977783 + }, + { + "auxiliary_loss_clip": 0.01548337, + "auxiliary_loss_mlp": 0.01310959, + "balance_loss_clip": 1.19544959, + "balance_loss_mlp": 1.07006037, + "epoch": 0.16443709604689613, + "flos": 34535110099680.0, + "grad_norm": 4.039277721873878, + "language_loss": 0.82234025, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.85093325, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 2.9223837852478027 + }, + { + "auxiliary_loss_clip": 0.01553098, + "auxiliary_loss_mlp": 0.01328011, + "balance_loss_clip": 1.20013571, + "balance_loss_mlp": 1.08940196, + "epoch": 0.1644972192995641, + "flos": 23260283042400.0, + "grad_norm": 5.374695433602822, + "language_loss": 0.78638691, + "learning_rate": 3.813223186925296e-06, + "loss": 0.81519794, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 2.8192994594573975 + }, + { + "auxiliary_loss_clip": 0.01546431, + "auxiliary_loss_mlp": 0.01341486, + "balance_loss_clip": 1.19471264, + "balance_loss_mlp": 1.09963417, + "epoch": 0.1645573425522321, + "flos": 26981982974880.0, + "grad_norm": 1.932925119976584, + "language_loss": 0.81534219, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.84422135, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 2.9038290977478027 + }, + { + "auxiliary_loss_clip": 0.01546912, + "auxiliary_loss_mlp": 0.01354809, + "balance_loss_clip": 1.19578671, + "balance_loss_mlp": 1.11028671, + "epoch": 0.16461746580490005, + "flos": 28734357310560.0, + "grad_norm": 2.040510881961296, + "language_loss": 0.87455642, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.90357357, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 2.870635747909546 + }, + { + "auxiliary_loss_clip": 0.01545673, + "auxiliary_loss_mlp": 0.01323243, + "balance_loss_clip": 1.19279802, + "balance_loss_mlp": 1.07795763, + "epoch": 0.16467758905756802, + "flos": 24932135165760.0, + "grad_norm": 2.4438223631272558, + "language_loss": 0.72091782, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74960691, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.8321969509124756 + }, + { + "auxiliary_loss_clip": 0.01545943, + "auxiliary_loss_mlp": 0.01304338, + "balance_loss_clip": 1.19295967, + "balance_loss_mlp": 1.0605793, + "epoch": 0.16473771231023598, + "flos": 24828817764960.0, + "grad_norm": 1.968866223326466, + "language_loss": 0.81933534, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.84783816, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 2.859666585922241 + }, + { + "auxiliary_loss_clip": 0.0154778, + "auxiliary_loss_mlp": 0.01333131, + "balance_loss_clip": 1.19403541, + "balance_loss_mlp": 1.08460355, + "epoch": 0.16479783556290395, + "flos": 39899267467200.0, + "grad_norm": 2.5218265937983593, + "language_loss": 0.69136667, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.7201758, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 2.931248903274536 + }, + { + "auxiliary_loss_clip": 0.01548782, + "auxiliary_loss_mlp": 0.0131353, + "balance_loss_clip": 1.19680953, + "balance_loss_mlp": 1.06977105, + "epoch": 0.16485795881557191, + "flos": 19898866344960.0, + "grad_norm": 2.6337244018926906, + "language_loss": 0.80115354, + "learning_rate": 3.812235911671472e-06, + "loss": 0.8297767, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 2.7597482204437256 + }, + { + "auxiliary_loss_clip": 0.01545948, + "auxiliary_loss_mlp": 0.01301553, + "balance_loss_clip": 1.19353724, + "balance_loss_mlp": 1.05893862, + "epoch": 0.16491808206823988, + "flos": 20558102251680.0, + "grad_norm": 2.1860005646247744, + "language_loss": 0.85016, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.87863505, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 2.803880214691162 + }, + { + "auxiliary_loss_clip": 0.01546747, + "auxiliary_loss_mlp": 0.01323098, + "balance_loss_clip": 1.19301152, + "balance_loss_mlp": 1.08734953, + "epoch": 0.16497820532090787, + "flos": 23802812907840.0, + "grad_norm": 2.8366348854384533, + "language_loss": 0.85911566, + "learning_rate": 3.811906270092265e-06, + "loss": 0.88781404, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 2.8356337547302246 + }, + { + "auxiliary_loss_clip": 0.01549568, + "auxiliary_loss_mlp": 0.01278041, + "balance_loss_clip": 1.19828773, + "balance_loss_mlp": 1.03924108, + "epoch": 0.16503832857357584, + "flos": 25484943562560.0, + "grad_norm": 2.0105039213822353, + "language_loss": 0.83222389, + "learning_rate": 3.811741346238036e-06, + "loss": 0.86050004, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 2.7898061275482178 + }, + { + "auxiliary_loss_clip": 0.01549034, + "auxiliary_loss_mlp": 0.01343057, + "balance_loss_clip": 1.19477916, + "balance_loss_mlp": 1.10444784, + "epoch": 0.1650984518262438, + "flos": 17677998640800.0, + "grad_norm": 3.9173166363656144, + "language_loss": 0.76768839, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.79660928, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 2.8616442680358887 + }, + { + "auxiliary_loss_clip": 0.01548814, + "auxiliary_loss_mlp": 0.01357641, + "balance_loss_clip": 1.19576168, + "balance_loss_mlp": 1.11311853, + "epoch": 0.16515857507891177, + "flos": 18700362394560.0, + "grad_norm": 1.607562526491197, + "language_loss": 0.80776662, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83683115, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.820927619934082 + }, + { + "auxiliary_loss_clip": 0.01550004, + "auxiliary_loss_mlp": 0.01331783, + "balance_loss_clip": 1.19566476, + "balance_loss_mlp": 1.08554423, + "epoch": 0.16521869833157973, + "flos": 15012153391680.0, + "grad_norm": 3.4249654256339768, + "language_loss": 0.69823849, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.72705632, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 2.756340742111206 + }, + { + "auxiliary_loss_clip": 0.01551225, + "auxiliary_loss_mlp": 0.01295293, + "balance_loss_clip": 1.19599164, + "balance_loss_mlp": 1.05611181, + "epoch": 0.1652788215842477, + "flos": 22122995870880.0, + "grad_norm": 2.3489395785612914, + "language_loss": 0.88089681, + "learning_rate": 3.811080963869561e-06, + "loss": 0.90936208, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 2.7800040245056152 + }, + { + "auxiliary_loss_clip": 0.01548365, + "auxiliary_loss_mlp": 0.0129735, + "balance_loss_clip": 1.1942513, + "balance_loss_mlp": 1.05721521, + "epoch": 0.16533894483691566, + "flos": 18335072642400.0, + "grad_norm": 2.176151721512627, + "language_loss": 0.79353166, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.82198882, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 2.7705535888671875 + }, + { + "auxiliary_loss_clip": 0.01545475, + "auxiliary_loss_mlp": 0.01294841, + "balance_loss_clip": 1.18948042, + "balance_loss_mlp": 1.056041, + "epoch": 0.16539906808958366, + "flos": 22384285515360.0, + "grad_norm": 1.9084207315195252, + "language_loss": 0.95400977, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.98241287, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.8158748149871826 + }, + { + "auxiliary_loss_clip": 0.01547914, + "auxiliary_loss_mlp": 0.01306886, + "balance_loss_clip": 1.19316614, + "balance_loss_mlp": 1.06617892, + "epoch": 0.16545919134225162, + "flos": 22713315582240.0, + "grad_norm": 3.3306086222720706, + "language_loss": 0.71056306, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73911101, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 2.8093795776367188 + }, + { + "auxiliary_loss_clip": 0.01704533, + "auxiliary_loss_mlp": 0.0121534, + "balance_loss_clip": 1.35943675, + "balance_loss_mlp": 1.02269745, + "epoch": 0.1655193145949196, + "flos": 67809513337920.0, + "grad_norm": 0.8862269185526003, + "language_loss": 0.54063171, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56983042, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.431023120880127 + }, + { + "auxiliary_loss_clip": 0.01541393, + "auxiliary_loss_mlp": 0.01379618, + "balance_loss_clip": 1.18888152, + "balance_loss_mlp": 1.13452363, + "epoch": 0.16557943784758755, + "flos": 24282722652480.0, + "grad_norm": 1.908401645140051, + "language_loss": 0.75561488, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.78482503, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 4.3296802043914795 + }, + { + "auxiliary_loss_clip": 0.01542424, + "auxiliary_loss_mlp": 0.01469163, + "balance_loss_clip": 1.18817616, + "balance_loss_mlp": 1.21987295, + "epoch": 0.16563956110025552, + "flos": 20085664354560.0, + "grad_norm": 5.479673578089434, + "language_loss": 0.87014192, + "learning_rate": 3.810088330151188e-06, + "loss": 0.90025777, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 2.84525728225708 + }, + { + "auxiliary_loss_clip": 0.01544089, + "auxiliary_loss_mlp": 0.01496574, + "balance_loss_clip": 1.18897223, + "balance_loss_mlp": 1.24938166, + "epoch": 0.16569968435292348, + "flos": 28036775669760.0, + "grad_norm": 2.0715649809013477, + "language_loss": 0.73487175, + "learning_rate": 3.80992265092595e-06, + "loss": 0.76527846, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.7997477054595947 + }, + { + "auxiliary_loss_clip": 0.0154603, + "auxiliary_loss_mlp": 0.01438282, + "balance_loss_clip": 1.19237781, + "balance_loss_mlp": 1.20081758, + "epoch": 0.16575980760559147, + "flos": 26252655099840.0, + "grad_norm": 2.07236021226608, + "language_loss": 0.75363749, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.78348064, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 2.916705846786499 + }, + { + "auxiliary_loss_clip": 0.01548791, + "auxiliary_loss_mlp": 0.01365944, + "balance_loss_clip": 1.19389009, + "balance_loss_mlp": 1.13477325, + "epoch": 0.16581993085825944, + "flos": 26946443924640.0, + "grad_norm": 1.889487349971814, + "language_loss": 0.84952939, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.87867677, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.8115382194519043 + }, + { + "auxiliary_loss_clip": 0.01535292, + "auxiliary_loss_mlp": 0.01395003, + "balance_loss_clip": 1.18076026, + "balance_loss_mlp": 1.16688418, + "epoch": 0.1658800541109274, + "flos": 21655829988000.0, + "grad_norm": 2.0532089950251584, + "language_loss": 0.7943505, + "learning_rate": 3.809425201480689e-06, + "loss": 0.82365346, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.80938458442688 + }, + { + "auxiliary_loss_clip": 0.01541203, + "auxiliary_loss_mlp": 0.01443815, + "balance_loss_clip": 1.18628919, + "balance_loss_mlp": 1.2246604, + "epoch": 0.16594017736359537, + "flos": 16437356140320.0, + "grad_norm": 2.455191155870581, + "language_loss": 0.75311458, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.78296471, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 4.176727056503296 + }, + { + "auxiliary_loss_clip": 0.01534665, + "auxiliary_loss_mlp": 0.01461214, + "balance_loss_clip": 1.18014872, + "balance_loss_mlp": 1.23481202, + "epoch": 0.16600030061626334, + "flos": 22639468726080.0, + "grad_norm": 2.3275581027087444, + "language_loss": 0.73464698, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.76460576, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 2.781913995742798 + }, + { + "auxiliary_loss_clip": 0.01540315, + "auxiliary_loss_mlp": 0.01482842, + "balance_loss_clip": 1.18592238, + "balance_loss_mlp": 1.26273406, + "epoch": 0.1660604238689313, + "flos": 26399021326560.0, + "grad_norm": 2.3465309830414847, + "language_loss": 0.8904084, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.92063999, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.8453779220581055 + }, + { + "auxiliary_loss_clip": 0.01536888, + "auxiliary_loss_mlp": 0.01486885, + "balance_loss_clip": 1.18232965, + "balance_loss_mlp": 1.26734865, + "epoch": 0.16612054712159927, + "flos": 23042269792800.0, + "grad_norm": 1.9841510422003228, + "language_loss": 0.87945777, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90969551, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 5.81091833114624 + }, + { + "auxiliary_loss_clip": 0.01672893, + "auxiliary_loss_mlp": 0.01398788, + "balance_loss_clip": 1.32754266, + "balance_loss_mlp": 1.22407532, + "epoch": 0.16618067037426726, + "flos": 59247980022240.0, + "grad_norm": 0.7834571357621538, + "language_loss": 0.59733802, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.62805486, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 3.309023380279541 + }, + { + "auxiliary_loss_clip": 0.01541506, + "auxiliary_loss_mlp": 0.01370887, + "balance_loss_clip": 1.18689966, + "balance_loss_mlp": 1.13876235, + "epoch": 0.16624079362693522, + "flos": 27201285781920.0, + "grad_norm": 2.144347956657854, + "language_loss": 0.82401371, + "learning_rate": 3.808428450193401e-06, + "loss": 0.85313767, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.7967793941497803 + }, + { + "auxiliary_loss_clip": 0.01537477, + "auxiliary_loss_mlp": 0.01336821, + "balance_loss_clip": 1.18310058, + "balance_loss_mlp": 1.09268022, + "epoch": 0.1663009168796032, + "flos": 10926567048960.0, + "grad_norm": 2.4425517877736485, + "language_loss": 0.70148355, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.73022652, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.7607946395874023 + }, + { + "auxiliary_loss_clip": 0.01539143, + "auxiliary_loss_mlp": 0.01431509, + "balance_loss_clip": 1.18518519, + "balance_loss_mlp": 1.18813109, + "epoch": 0.16636104013227115, + "flos": 17896353243840.0, + "grad_norm": 6.042195052878659, + "language_loss": 0.88847429, + "learning_rate": 3.808095651090769e-06, + "loss": 0.91818082, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.783069372177124 + }, + { + "auxiliary_loss_clip": 0.01672762, + "auxiliary_loss_mlp": 0.01595909, + "balance_loss_clip": 1.328493, + "balance_loss_mlp": 1.38610077, + "epoch": 0.16642116338493912, + "flos": 66733518437280.0, + "grad_norm": 0.7238629060859261, + "language_loss": 0.52779174, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.56047845, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.355238437652588 + }, + { + "auxiliary_loss_clip": 0.01535278, + "auxiliary_loss_mlp": 0.01301234, + "balance_loss_clip": 1.18027818, + "balance_loss_mlp": 1.05938244, + "epoch": 0.16648128663760708, + "flos": 19028064975840.0, + "grad_norm": 3.0406859659973473, + "language_loss": 0.85144806, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87981319, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.771115303039551 + }, + { + "auxiliary_loss_clip": 0.0166916, + "auxiliary_loss_mlp": 0.01344826, + "balance_loss_clip": 1.32530725, + "balance_loss_mlp": 1.16400909, + "epoch": 0.16654140989027508, + "flos": 70141473715680.0, + "grad_norm": 0.8082045859224156, + "language_loss": 0.57448733, + "learning_rate": 3.80759593822885e-06, + "loss": 0.60462725, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 3.151592254638672 + }, + { + "auxiliary_loss_clip": 0.01666562, + "auxiliary_loss_mlp": 0.01394989, + "balance_loss_clip": 1.32270229, + "balance_loss_mlp": 1.21875, + "epoch": 0.16660153314294304, + "flos": 70278433758720.0, + "grad_norm": 0.8707426666680187, + "language_loss": 0.56240427, + "learning_rate": 3.807429230178015e-06, + "loss": 0.59301972, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 3.127485513687134 + }, + { + "auxiliary_loss_clip": 0.01542591, + "auxiliary_loss_mlp": 0.01414574, + "balance_loss_clip": 1.18966126, + "balance_loss_mlp": 1.18473852, + "epoch": 0.166661656395611, + "flos": 23077239920640.0, + "grad_norm": 3.563525862678538, + "language_loss": 0.70278424, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.73235595, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.7584047317504883 + }, + { + "auxiliary_loss_clip": 0.01539468, + "auxiliary_loss_mlp": 0.01357566, + "balance_loss_clip": 1.18558741, + "balance_loss_mlp": 1.12620497, + "epoch": 0.16672177964827897, + "flos": 28368877917600.0, + "grad_norm": 1.9062288734540953, + "language_loss": 0.85825634, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88722664, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.8628175258636475 + }, + { + "auxiliary_loss_clip": 0.01537164, + "auxiliary_loss_mlp": 0.0134543, + "balance_loss_clip": 1.18499184, + "balance_loss_mlp": 1.10643888, + "epoch": 0.16678190290094694, + "flos": 19092581504640.0, + "grad_norm": 2.474480335662655, + "language_loss": 0.82406384, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.85288978, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 2.7492687702178955 + }, + { + "auxiliary_loss_clip": 0.01539938, + "auxiliary_loss_mlp": 0.01405707, + "balance_loss_clip": 1.18734527, + "balance_loss_mlp": 1.16271067, + "epoch": 0.1668420261536149, + "flos": 21801285938880.0, + "grad_norm": 2.99609549907913, + "language_loss": 0.8297075, + "learning_rate": 3.806761712658952e-06, + "loss": 0.859164, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 2.8027656078338623 + }, + { + "auxiliary_loss_clip": 0.01541792, + "auxiliary_loss_mlp": 0.01423769, + "balance_loss_clip": 1.18810296, + "balance_loss_mlp": 1.18096399, + "epoch": 0.16690214940628287, + "flos": 19064552230080.0, + "grad_norm": 3.0559545155964, + "language_loss": 0.80959666, + "learning_rate": 3.806594661981897e-06, + "loss": 0.83925223, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.7705235481262207 + }, + { + "auxiliary_loss_clip": 0.01539817, + "auxiliary_loss_mlp": 0.01386883, + "balance_loss_clip": 1.18622315, + "balance_loss_mlp": 1.14293289, + "epoch": 0.16696227265895086, + "flos": 18590559278400.0, + "grad_norm": 2.095934352773753, + "language_loss": 0.80526644, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.83453345, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 2.7969484329223633 + }, + { + "auxiliary_loss_clip": 0.0154257, + "auxiliary_loss_mlp": 0.01340682, + "balance_loss_clip": 1.18958354, + "balance_loss_mlp": 1.10359883, + "epoch": 0.16702239591161883, + "flos": 23296353086880.0, + "grad_norm": 1.6943854642439533, + "language_loss": 0.85364825, + "learning_rate": 3.806260355115371e-06, + "loss": 0.8824808, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 2.7479100227355957 + }, + { + "auxiliary_loss_clip": 0.01541003, + "auxiliary_loss_mlp": 0.0137174, + "balance_loss_clip": 1.18681741, + "balance_loss_mlp": 1.13541913, + "epoch": 0.1670825191642868, + "flos": 24427951034400.0, + "grad_norm": 2.6788323120226054, + "language_loss": 0.74484617, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.77397358, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 2.8023412227630615 + }, + { + "auxiliary_loss_clip": 0.01533628, + "auxiliary_loss_mlp": 0.01425985, + "balance_loss_clip": 1.17959726, + "balance_loss_mlp": 1.19862938, + "epoch": 0.16714264241695476, + "flos": 26799963913440.0, + "grad_norm": 3.1697778054542556, + "language_loss": 0.65788561, + "learning_rate": 3.805925774274554e-06, + "loss": 0.6874817, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 2.844943046569824 + }, + { + "auxiliary_loss_clip": 0.01544213, + "auxiliary_loss_mlp": 0.01453559, + "balance_loss_clip": 1.19073558, + "balance_loss_mlp": 1.23268855, + "epoch": 0.16720276566962272, + "flos": 21837469767840.0, + "grad_norm": 2.6221760313061058, + "language_loss": 0.78531003, + "learning_rate": 3.805758381129643e-06, + "loss": 0.81528771, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 2.7584896087646484 + }, + { + "auxiliary_loss_clip": 0.01540986, + "auxiliary_loss_mlp": 0.01448491, + "balance_loss_clip": 1.18759799, + "balance_loss_mlp": 1.2165575, + "epoch": 0.1672628889222907, + "flos": 21472559297280.0, + "grad_norm": 1.742320952806215, + "language_loss": 0.75471389, + "learning_rate": 3.805590919510193e-06, + "loss": 0.78460872, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 2.8252720832824707 + }, + { + "auxiliary_loss_clip": 0.01539649, + "auxiliary_loss_mlp": 0.01429738, + "balance_loss_clip": 1.18702829, + "balance_loss_mlp": 1.19475293, + "epoch": 0.16732301217495865, + "flos": 30776657415840.0, + "grad_norm": 6.285544293283539, + "language_loss": 0.68321931, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.71291316, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.8143012523651123 + }, + { + "auxiliary_loss_clip": 0.01540755, + "auxiliary_loss_mlp": 0.01435116, + "balance_loss_clip": 1.18662381, + "balance_loss_mlp": 1.20432639, + "epoch": 0.16738313542762664, + "flos": 23476741237440.0, + "grad_norm": 1.9295563468109116, + "language_loss": 0.70153826, + "learning_rate": 3.805255790873081e-06, + "loss": 0.73129702, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 2.812511682510376 + }, + { + "auxiliary_loss_clip": 0.0153636, + "auxiliary_loss_mlp": 0.01407388, + "balance_loss_clip": 1.18165112, + "balance_loss_mlp": 1.17659879, + "epoch": 0.1674432586802946, + "flos": 29791691192160.0, + "grad_norm": 2.639639226107774, + "language_loss": 0.60764301, + "learning_rate": 3.805088123868126e-06, + "loss": 0.63708049, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 2.8077731132507324 + }, + { + "auxiliary_loss_clip": 0.01676169, + "auxiliary_loss_mlp": 0.01395966, + "balance_loss_clip": 1.32687676, + "balance_loss_mlp": 1.21934509, + "epoch": 0.16750338193296258, + "flos": 66143160797760.0, + "grad_norm": 0.8656678073144535, + "language_loss": 0.58748579, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.6182071, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.40813946723938 + }, + { + "auxiliary_loss_clip": 0.01541527, + "auxiliary_loss_mlp": 0.014138, + "balance_loss_clip": 1.18725455, + "balance_loss_mlp": 1.16717958, + "epoch": 0.16756350518563054, + "flos": 25698822642720.0, + "grad_norm": 3.3367985852328403, + "language_loss": 0.76232916, + "learning_rate": 3.80475258451721e-06, + "loss": 0.79188246, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 2.80639386177063 + }, + { + "auxiliary_loss_clip": 0.01540114, + "auxiliary_loss_mlp": 0.01578023, + "balance_loss_clip": 1.18737113, + "balance_loss_mlp": 1.31805098, + "epoch": 0.1676236284382985, + "flos": 23838048532800.0, + "grad_norm": 2.2853453217514934, + "language_loss": 0.77410269, + "learning_rate": 3.804584712183972e-06, + "loss": 0.80528408, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 2.845659017562866 + }, + { + "auxiliary_loss_clip": 0.01678861, + "auxiliary_loss_mlp": 0.01790543, + "balance_loss_clip": 1.33001423, + "balance_loss_mlp": 1.5738678, + "epoch": 0.16768375169096647, + "flos": 59880400719840.0, + "grad_norm": 0.9124877326846169, + "language_loss": 0.59336132, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.62805533, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 3.144632339477539 + }, + { + "auxiliary_loss_clip": 0.01546815, + "auxiliary_loss_mlp": 0.01418515, + "balance_loss_clip": 1.19292259, + "balance_loss_mlp": 1.169415, + "epoch": 0.16774387494363446, + "flos": 38438715309120.0, + "grad_norm": 2.1069703379286318, + "language_loss": 0.69979334, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72944665, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 2.9409842491149902 + }, + { + "auxiliary_loss_clip": 0.01539874, + "auxiliary_loss_mlp": 0.01386563, + "balance_loss_clip": 1.18529654, + "balance_loss_mlp": 1.14604592, + "epoch": 0.16780399819630243, + "flos": 22639810079520.0, + "grad_norm": 5.589930470211204, + "language_loss": 0.79177201, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.82103634, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.837691068649292 + }, + { + "auxiliary_loss_clip": 0.01534116, + "auxiliary_loss_mlp": 0.01468411, + "balance_loss_clip": 1.18210924, + "balance_loss_mlp": 1.22694075, + "epoch": 0.1678641214489704, + "flos": 32894359431840.0, + "grad_norm": 2.1834248528858793, + "language_loss": 0.71508503, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.74511027, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.910346746444702 + }, + { + "auxiliary_loss_clip": 0.01535281, + "auxiliary_loss_mlp": 0.01510892, + "balance_loss_clip": 1.18187261, + "balance_loss_mlp": 1.28124762, + "epoch": 0.16792424470163836, + "flos": 19976999083200.0, + "grad_norm": 2.037548846029386, + "language_loss": 0.72015172, + "learning_rate": 3.803744324194691e-06, + "loss": 0.75061351, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 4.343337297439575 + }, + { + "auxiliary_loss_clip": 0.01537464, + "auxiliary_loss_mlp": 0.01532912, + "balance_loss_clip": 1.18380105, + "balance_loss_mlp": 1.30593789, + "epoch": 0.16798436795430632, + "flos": 19721967585120.0, + "grad_norm": 18.99154012950904, + "language_loss": 0.77285075, + "learning_rate": 3.803576041376831e-06, + "loss": 0.80355448, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 2.787116765975952 + }, + { + "auxiliary_loss_clip": 0.01534992, + "auxiliary_loss_mlp": 0.01531376, + "balance_loss_clip": 1.18102336, + "balance_loss_mlp": 1.30554557, + "epoch": 0.1680444912069743, + "flos": 28107019350720.0, + "grad_norm": 2.6166771919094645, + "language_loss": 0.71939993, + "learning_rate": 3.803407690167187e-06, + "loss": 0.75006366, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.8235387802124023 + }, + { + "auxiliary_loss_clip": 0.01538589, + "auxiliary_loss_mlp": 0.01525469, + "balance_loss_clip": 1.18514848, + "balance_loss_mlp": 1.30002046, + "epoch": 0.16810461445964225, + "flos": 18077044819680.0, + "grad_norm": 2.425224228319426, + "language_loss": 0.84582424, + "learning_rate": 3.803239270572142e-06, + "loss": 0.87646484, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 2.81258487701416 + }, + { + "auxiliary_loss_clip": 0.01529822, + "auxiliary_loss_mlp": 0.01540279, + "balance_loss_clip": 1.17559409, + "balance_loss_mlp": 1.31006193, + "epoch": 0.16816473771231025, + "flos": 23880945646080.0, + "grad_norm": 2.034425750158143, + "language_loss": 0.81827116, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.8489722, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.8683550357818604 + }, + { + "auxiliary_loss_clip": 0.01539095, + "auxiliary_loss_mlp": 0.01521699, + "balance_loss_clip": 1.18567801, + "balance_loss_mlp": 1.29930258, + "epoch": 0.1682248609649782, + "flos": 22785797024640.0, + "grad_norm": 1.4980411934258224, + "language_loss": 0.74929321, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77990115, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 4.312123775482178 + }, + { + "auxiliary_loss_clip": 0.01539874, + "auxiliary_loss_mlp": 0.01527069, + "balance_loss_clip": 1.18421507, + "balance_loss_mlp": 1.30772388, + "epoch": 0.16828498421764618, + "flos": 20707275162240.0, + "grad_norm": 1.5885639141328067, + "language_loss": 0.80114645, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.83181584, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.7835659980773926 + }, + { + "auxiliary_loss_clip": 0.0152756, + "auxiliary_loss_mlp": 0.01555915, + "balance_loss_clip": 1.17327046, + "balance_loss_mlp": 1.33409047, + "epoch": 0.16834510747031414, + "flos": 29422987905600.0, + "grad_norm": 2.5207780791932977, + "language_loss": 0.70395052, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.73478526, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 4.330533742904663 + }, + { + "auxiliary_loss_clip": 0.01538221, + "auxiliary_loss_mlp": 0.01550204, + "balance_loss_clip": 1.18314838, + "balance_loss_mlp": 1.32933331, + "epoch": 0.1684052307229821, + "flos": 18147060931680.0, + "grad_norm": 2.2481737237292116, + "language_loss": 0.8379041, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.86878836, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 2.8598268032073975 + }, + { + "auxiliary_loss_clip": 0.01540706, + "auxiliary_loss_mlp": 0.01576649, + "balance_loss_clip": 1.1845777, + "balance_loss_mlp": 1.35844803, + "epoch": 0.16846535397565007, + "flos": 16576440160320.0, + "grad_norm": 2.709823633495395, + "language_loss": 0.82556146, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.85673505, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 4.628204107284546 + }, + { + "auxiliary_loss_clip": 0.01524284, + "auxiliary_loss_mlp": 0.01546126, + "balance_loss_clip": 1.16913223, + "balance_loss_mlp": 1.32067752, + "epoch": 0.16852547722831807, + "flos": 30411481448160.0, + "grad_norm": 1.8048360307483415, + "language_loss": 0.80865097, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83935511, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.8111939430236816 + }, + { + "auxiliary_loss_clip": 0.01536974, + "auxiliary_loss_mlp": 0.01531355, + "balance_loss_clip": 1.18177867, + "balance_loss_mlp": 1.30495262, + "epoch": 0.16858560048098603, + "flos": 33510053446560.0, + "grad_norm": 2.858710593104281, + "language_loss": 0.76858819, + "learning_rate": 3.801889452704297e-06, + "loss": 0.79927146, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 2.996551752090454 + }, + { + "auxiliary_loss_clip": 0.01665847, + "auxiliary_loss_mlp": 0.01605064, + "balance_loss_clip": 1.31399703, + "balance_loss_mlp": 1.43683624, + "epoch": 0.168645723733654, + "flos": 67377317582880.0, + "grad_norm": 0.942518183828737, + "language_loss": 0.55379808, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.5865072, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.231733560562134 + }, + { + "auxiliary_loss_clip": 0.0153411, + "auxiliary_loss_mlp": 0.01512524, + "balance_loss_clip": 1.17874455, + "balance_loss_mlp": 1.28135371, + "epoch": 0.16870584698632196, + "flos": 21326761992960.0, + "grad_norm": 1.9092288223204363, + "language_loss": 0.73020142, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.76066774, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.7858452796936035 + }, + { + "auxiliary_loss_clip": 0.01534564, + "auxiliary_loss_mlp": 0.01493414, + "balance_loss_clip": 1.17866349, + "balance_loss_mlp": 1.26224387, + "epoch": 0.16876597023898993, + "flos": 20742852140640.0, + "grad_norm": 1.820009418958991, + "language_loss": 0.69966698, + "learning_rate": 3.80138214341862e-06, + "loss": 0.72994679, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.788632869720459 + }, + { + "auxiliary_loss_clip": 0.01530907, + "auxiliary_loss_mlp": 0.01485417, + "balance_loss_clip": 1.17487597, + "balance_loss_mlp": 1.24032259, + "epoch": 0.1688260934916579, + "flos": 20305611940320.0, + "grad_norm": 2.59012847636787, + "language_loss": 0.70484191, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.7350052, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.8305182456970215 + }, + { + "auxiliary_loss_clip": 0.01522016, + "auxiliary_loss_mlp": 0.01479129, + "balance_loss_clip": 1.16702843, + "balance_loss_mlp": 1.23708665, + "epoch": 0.16888621674432586, + "flos": 20342857757760.0, + "grad_norm": 3.1252488226388815, + "language_loss": 0.80501819, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8350296, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.7672078609466553 + }, + { + "auxiliary_loss_clip": 0.01528283, + "auxiliary_loss_mlp": 0.01490098, + "balance_loss_clip": 1.17215943, + "balance_loss_mlp": 1.24233377, + "epoch": 0.16894633999699385, + "flos": 16246196392320.0, + "grad_norm": 3.000735724126352, + "language_loss": 0.88621801, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.91640186, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.770827054977417 + }, + { + "auxiliary_loss_clip": 0.0153245, + "auxiliary_loss_mlp": 0.01500358, + "balance_loss_clip": 1.17606199, + "balance_loss_mlp": 1.25335598, + "epoch": 0.16900646324966181, + "flos": 19612240325280.0, + "grad_norm": 1.9932171748935854, + "language_loss": 0.92184174, + "learning_rate": 3.800704774747416e-06, + "loss": 0.95216978, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.8117058277130127 + }, + { + "auxiliary_loss_clip": 0.01529019, + "auxiliary_loss_mlp": 0.01464923, + "balance_loss_clip": 1.17155004, + "balance_loss_mlp": 1.22383463, + "epoch": 0.16906658650232978, + "flos": 22020285320640.0, + "grad_norm": 2.0576140101964295, + "language_loss": 0.79080051, + "learning_rate": 3.800535261856291e-06, + "loss": 0.82073998, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 2.8212263584136963 + }, + { + "auxiliary_loss_clip": 0.01537418, + "auxiliary_loss_mlp": 0.01495051, + "balance_loss_clip": 1.1811223, + "balance_loss_mlp": 1.2539618, + "epoch": 0.16912670975499774, + "flos": 11765204974080.0, + "grad_norm": 2.8246483352486806, + "language_loss": 0.75385058, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.78417528, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 2.836308240890503 + }, + { + "auxiliary_loss_clip": 0.01520013, + "auxiliary_loss_mlp": 0.01452771, + "balance_loss_clip": 1.16352558, + "balance_loss_mlp": 1.20710444, + "epoch": 0.1691868330076657, + "flos": 17163080840160.0, + "grad_norm": 4.181461259556963, + "language_loss": 0.69324255, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.72297037, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.737643241882324 + }, + { + "auxiliary_loss_clip": 0.01530439, + "auxiliary_loss_mlp": 0.01474092, + "balance_loss_clip": 1.17332482, + "balance_loss_mlp": 1.22613657, + "epoch": 0.16924695626033368, + "flos": 22418610864480.0, + "grad_norm": 2.1693205600378898, + "language_loss": 0.61852801, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64857328, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.835177421569824 + }, + { + "auxiliary_loss_clip": 0.01526224, + "auxiliary_loss_mlp": 0.01436841, + "balance_loss_clip": 1.16997564, + "balance_loss_mlp": 1.19651508, + "epoch": 0.16930707951300164, + "flos": 25742061109440.0, + "grad_norm": 2.364143147888709, + "language_loss": 0.82490349, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.85453415, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 2.863206148147583 + }, + { + "auxiliary_loss_clip": 0.0152999, + "auxiliary_loss_mlp": 0.01478672, + "balance_loss_clip": 1.17329967, + "balance_loss_mlp": 1.23643851, + "epoch": 0.16936720276566963, + "flos": 22749233914080.0, + "grad_norm": 3.4506993270627064, + "language_loss": 0.87590259, + "learning_rate": 3.799686673382153e-06, + "loss": 0.90598917, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 2.8705196380615234 + }, + { + "auxiliary_loss_clip": 0.01537922, + "auxiliary_loss_mlp": 0.01464386, + "balance_loss_clip": 1.18129885, + "balance_loss_mlp": 1.2240603, + "epoch": 0.1694273260183376, + "flos": 19576056496320.0, + "grad_norm": 2.2154106508312936, + "language_loss": 0.8139962, + "learning_rate": 3.799516750928672e-06, + "loss": 0.84401929, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 2.815096378326416 + }, + { + "auxiliary_loss_clip": 0.01522673, + "auxiliary_loss_mlp": 0.01439668, + "balance_loss_clip": 1.16554356, + "balance_loss_mlp": 1.19819808, + "epoch": 0.16948744927100556, + "flos": 12459448936800.0, + "grad_norm": 3.218698537613874, + "language_loss": 0.8071292, + "learning_rate": 3.799346760237336e-06, + "loss": 0.83675265, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 2.762108325958252 + }, + { + "auxiliary_loss_clip": 0.0165268, + "auxiliary_loss_mlp": 0.01470886, + "balance_loss_clip": 1.29890203, + "balance_loss_mlp": 1.28358459, + "epoch": 0.16954757252367353, + "flos": 71297649475200.0, + "grad_norm": 0.9548343841826371, + "language_loss": 0.60994709, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.64118278, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 3.2177574634552 + }, + { + "auxiliary_loss_clip": 0.01524053, + "auxiliary_loss_mlp": 0.01426104, + "balance_loss_clip": 1.16800201, + "balance_loss_mlp": 1.17833972, + "epoch": 0.1696076957763415, + "flos": 29609103208320.0, + "grad_norm": 2.8964657139267294, + "language_loss": 0.78866625, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.81816781, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.8296525478363037 + }, + { + "auxiliary_loss_clip": 0.01526607, + "auxiliary_loss_mlp": 0.0148343, + "balance_loss_clip": 1.17068505, + "balance_loss_mlp": 1.24806273, + "epoch": 0.16966781902900946, + "flos": 24390856929600.0, + "grad_norm": 2.0384191162249317, + "language_loss": 0.78729486, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.81739527, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 2.853227376937866 + }, + { + "auxiliary_loss_clip": 0.01527074, + "auxiliary_loss_mlp": 0.01457477, + "balance_loss_clip": 1.16925144, + "balance_loss_mlp": 1.21963108, + "epoch": 0.16972794228167745, + "flos": 23041018163520.0, + "grad_norm": 2.066960968819207, + "language_loss": 0.75588179, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.78572726, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 2.8130271434783936 + }, + { + "auxiliary_loss_clip": 0.01524421, + "auxiliary_loss_mlp": 0.01451205, + "balance_loss_clip": 1.16812301, + "balance_loss_mlp": 1.20458531, + "epoch": 0.16978806553434542, + "flos": 35231781464640.0, + "grad_norm": 2.0113006217428584, + "language_loss": 0.60210794, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.63186419, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 2.966525077819824 + }, + { + "auxiliary_loss_clip": 0.0152557, + "auxiliary_loss_mlp": 0.01475034, + "balance_loss_clip": 1.16948771, + "balance_loss_mlp": 1.23089349, + "epoch": 0.16984818878701338, + "flos": 32016996491040.0, + "grad_norm": 1.6967123651337954, + "language_loss": 0.73547387, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.76547986, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.88747239112854 + }, + { + "auxiliary_loss_clip": 0.01514441, + "auxiliary_loss_mlp": 0.01456627, + "balance_loss_clip": 1.15786529, + "balance_loss_mlp": 1.20676398, + "epoch": 0.16990831203968135, + "flos": 22820729224320.0, + "grad_norm": 3.056229378913752, + "language_loss": 0.85754657, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.88725722, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 2.8122243881225586 + }, + { + "auxiliary_loss_clip": 0.01530821, + "auxiliary_loss_mlp": 0.01462915, + "balance_loss_clip": 1.17405164, + "balance_loss_mlp": 1.22010899, + "epoch": 0.1699684352923493, + "flos": 23041852583040.0, + "grad_norm": 2.0221011054089986, + "language_loss": 0.82585645, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.85579377, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.838435173034668 + }, + { + "auxiliary_loss_clip": 0.0152705, + "auxiliary_loss_mlp": 0.01474121, + "balance_loss_clip": 1.16937494, + "balance_loss_mlp": 1.23665559, + "epoch": 0.17002855854501728, + "flos": 21436261683840.0, + "grad_norm": 1.8885084797200957, + "language_loss": 0.74046707, + "learning_rate": 3.797813774376267e-06, + "loss": 0.77047879, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 2.8092169761657715 + }, + { + "auxiliary_loss_clip": 0.01636123, + "auxiliary_loss_mlp": 0.01438889, + "balance_loss_clip": 1.28199148, + "balance_loss_mlp": 1.25387573, + "epoch": 0.17008868179768524, + "flos": 71460362738880.0, + "grad_norm": 0.8194934437818969, + "language_loss": 0.56440341, + "learning_rate": 3.797643101661336e-06, + "loss": 0.59515357, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 3.3794384002685547 + }, + { + "auxiliary_loss_clip": 0.01524471, + "auxiliary_loss_mlp": 0.01433715, + "balance_loss_clip": 1.16736269, + "balance_loss_mlp": 1.1857599, + "epoch": 0.17014880505035324, + "flos": 24902664621120.0, + "grad_norm": 3.0690298084958836, + "language_loss": 0.83370185, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.86328375, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 4.382383823394775 + }, + { + "auxiliary_loss_clip": 0.01515511, + "auxiliary_loss_mlp": 0.01408718, + "balance_loss_clip": 1.15878868, + "balance_loss_mlp": 1.15446818, + "epoch": 0.1702089283030212, + "flos": 29865044982240.0, + "grad_norm": 2.504894172116749, + "language_loss": 0.78423798, + "learning_rate": 3.797301551737529e-06, + "loss": 0.81348026, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.8306338787078857 + }, + { + "auxiliary_loss_clip": 0.01527084, + "auxiliary_loss_mlp": 0.01442008, + "balance_loss_clip": 1.17156458, + "balance_loss_mlp": 1.18966603, + "epoch": 0.17026905155568917, + "flos": 17745852847680.0, + "grad_norm": 2.280810894488559, + "language_loss": 0.79450154, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.82419252, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.7884631156921387 + }, + { + "auxiliary_loss_clip": 0.01525955, + "auxiliary_loss_mlp": 0.01404779, + "balance_loss_clip": 1.17049766, + "balance_loss_mlp": 1.15167367, + "epoch": 0.17032917480835713, + "flos": 23150897136000.0, + "grad_norm": 2.456220367423627, + "language_loss": 0.89038563, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91969299, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.8080503940582275 + }, + { + "auxiliary_loss_clip": 0.01525492, + "auxiliary_loss_mlp": 0.01407515, + "balance_loss_clip": 1.16944623, + "balance_loss_mlp": 1.15135837, + "epoch": 0.1703892980610251, + "flos": 39205706211360.0, + "grad_norm": 3.778639266673682, + "language_loss": 0.72296488, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.75229496, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.9057846069335938 + }, + { + "auxiliary_loss_clip": 0.01528245, + "auxiliary_loss_mlp": 0.0139072, + "balance_loss_clip": 1.17142808, + "balance_loss_mlp": 1.13208365, + "epoch": 0.17044942131369306, + "flos": 23041321588800.0, + "grad_norm": 2.1612786519667093, + "language_loss": 0.86634219, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.89553183, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 2.796140670776367 + }, + { + "auxiliary_loss_clip": 0.01524474, + "auxiliary_loss_mlp": 0.01418336, + "balance_loss_clip": 1.16833353, + "balance_loss_mlp": 1.1663754, + "epoch": 0.17050954456636103, + "flos": 17056842971040.0, + "grad_norm": 2.658081178146401, + "language_loss": 0.74289751, + "learning_rate": 3.796446484348989e-06, + "loss": 0.77232563, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 2.7800960540771484 + }, + { + "auxiliary_loss_clip": 0.01530902, + "auxiliary_loss_mlp": 0.01414634, + "balance_loss_clip": 1.1748991, + "balance_loss_mlp": 1.15580642, + "epoch": 0.17056966781902902, + "flos": 16838867649600.0, + "grad_norm": 3.7980410098328115, + "language_loss": 0.80548489, + "learning_rate": 3.796275266481036e-06, + "loss": 0.8349402, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 4.286612272262573 + }, + { + "auxiliary_loss_clip": 0.01530654, + "auxiliary_loss_mlp": 0.01373014, + "balance_loss_clip": 1.17446899, + "balance_loss_mlp": 1.11247015, + "epoch": 0.17062979107169698, + "flos": 17714485895040.0, + "grad_norm": 2.5735130245934505, + "language_loss": 0.83754271, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.86657941, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.8277411460876465 + }, + { + "auxiliary_loss_clip": 0.01525497, + "auxiliary_loss_mlp": 0.013578, + "balance_loss_clip": 1.16939628, + "balance_loss_mlp": 1.09210634, + "epoch": 0.17068991432436495, + "flos": 22527048566880.0, + "grad_norm": 2.978289299161424, + "language_loss": 0.93548155, + "learning_rate": 3.795932626406812e-06, + "loss": 0.96431452, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 4.309381008148193 + }, + { + "auxiliary_loss_clip": 0.01529733, + "auxiliary_loss_mlp": 0.01395705, + "balance_loss_clip": 1.17352188, + "balance_loss_mlp": 1.13477945, + "epoch": 0.17075003757703291, + "flos": 25885203442560.0, + "grad_norm": 2.335408927838386, + "language_loss": 0.83820033, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86745465, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 4.5088605880737305 + }, + { + "auxiliary_loss_clip": 0.01524503, + "auxiliary_loss_mlp": 0.01398265, + "balance_loss_clip": 1.16798401, + "balance_loss_mlp": 1.14001036, + "epoch": 0.17081016082970088, + "flos": 20122796387520.0, + "grad_norm": 1.955247975464064, + "language_loss": 0.76715827, + "learning_rate": 3.79558971392481e-06, + "loss": 0.79638588, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 2.8172008991241455 + }, + { + "auxiliary_loss_clip": 0.01522845, + "auxiliary_loss_mlp": 0.01400611, + "balance_loss_clip": 1.16586924, + "balance_loss_mlp": 1.14235651, + "epoch": 0.17087028408236885, + "flos": 24938810521920.0, + "grad_norm": 2.0238160574896913, + "language_loss": 0.76695168, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79618633, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.8813812732696533 + }, + { + "auxiliary_loss_clip": 0.01527117, + "auxiliary_loss_mlp": 0.01367096, + "balance_loss_clip": 1.1709547, + "balance_loss_mlp": 1.10044909, + "epoch": 0.17093040733503684, + "flos": 19059621569280.0, + "grad_norm": 2.221752272797838, + "language_loss": 0.8581965, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88713861, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.751884698867798 + }, + { + "auxiliary_loss_clip": 0.01520601, + "auxiliary_loss_mlp": 0.01384316, + "balance_loss_clip": 1.16384017, + "balance_loss_mlp": 1.12224627, + "epoch": 0.1709905305877048, + "flos": 13080642534720.0, + "grad_norm": 3.5695945622324605, + "language_loss": 0.6902687, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.71931785, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 2.8173723220825195 + }, + { + "auxiliary_loss_clip": 0.0152042, + "auxiliary_loss_mlp": 0.01376672, + "balance_loss_clip": 1.16375434, + "balance_loss_mlp": 1.11441135, + "epoch": 0.17105065384037277, + "flos": 19211411522880.0, + "grad_norm": 1.951159718709336, + "language_loss": 0.78546786, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.81443882, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.7506041526794434 + }, + { + "auxiliary_loss_clip": 0.01517826, + "auxiliary_loss_mlp": 0.01390886, + "balance_loss_clip": 1.16139364, + "balance_loss_mlp": 1.13224971, + "epoch": 0.17111077709304073, + "flos": 18517167560160.0, + "grad_norm": 2.4324396449559154, + "language_loss": 0.78366089, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.81274807, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.8285303115844727 + }, + { + "auxiliary_loss_clip": 0.01514851, + "auxiliary_loss_mlp": 0.01351664, + "balance_loss_clip": 1.15864992, + "balance_loss_mlp": 1.08501625, + "epoch": 0.1711709003457087, + "flos": 25085025036000.0, + "grad_norm": 2.0878232655592623, + "language_loss": 0.80159342, + "learning_rate": 3.794559342552472e-06, + "loss": 0.83025861, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.8011600971221924 + }, + { + "auxiliary_loss_clip": 0.01518192, + "auxiliary_loss_mlp": 0.01379135, + "balance_loss_clip": 1.16244197, + "balance_loss_mlp": 1.11820936, + "epoch": 0.17123102359837666, + "flos": 17568309309120.0, + "grad_norm": 3.8332707427870774, + "language_loss": 0.86635911, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.8953324, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.8465723991394043 + }, + { + "auxiliary_loss_clip": 0.01523353, + "auxiliary_loss_mlp": 0.01401515, + "balance_loss_clip": 1.16671729, + "balance_loss_mlp": 1.14783812, + "epoch": 0.17129114685104463, + "flos": 26175849847200.0, + "grad_norm": 2.016113584904328, + "language_loss": 0.74839294, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77764165, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.8485565185546875 + }, + { + "auxiliary_loss_clip": 0.01628185, + "auxiliary_loss_mlp": 0.01411629, + "balance_loss_clip": 1.27218246, + "balance_loss_mlp": 1.21936798, + "epoch": 0.17135127010371262, + "flos": 69276740852160.0, + "grad_norm": 0.7959624578852187, + "language_loss": 0.57397288, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.60437101, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 3.3399696350097656 + }, + { + "auxiliary_loss_clip": 0.01523718, + "auxiliary_loss_mlp": 0.01410253, + "balance_loss_clip": 1.16696668, + "balance_loss_mlp": 1.15886497, + "epoch": 0.1714113933563806, + "flos": 23552370717120.0, + "grad_norm": 2.73175136214561, + "language_loss": 0.81137669, + "learning_rate": 3.793871067220031e-06, + "loss": 0.84071642, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 2.899707794189453 + }, + { + "auxiliary_loss_clip": 0.01526249, + "auxiliary_loss_mlp": 0.01418894, + "balance_loss_clip": 1.16924143, + "balance_loss_mlp": 1.16197395, + "epoch": 0.17147151660904855, + "flos": 21144591218880.0, + "grad_norm": 2.049275928118445, + "language_loss": 0.93309152, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.96254301, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.756409168243408 + }, + { + "auxiliary_loss_clip": 0.01512364, + "auxiliary_loss_mlp": 0.01414498, + "balance_loss_clip": 1.15540004, + "balance_loss_mlp": 1.15776873, + "epoch": 0.17153163986171652, + "flos": 18626894820000.0, + "grad_norm": 2.7356853457749235, + "language_loss": 0.69400394, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.72327256, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.748422622680664 + }, + { + "auxiliary_loss_clip": 0.01519261, + "auxiliary_loss_mlp": 0.01422707, + "balance_loss_clip": 1.16290343, + "balance_loss_mlp": 1.16712248, + "epoch": 0.17159176311438448, + "flos": 18225231598080.0, + "grad_norm": 2.3196631034217554, + "language_loss": 0.66837084, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.69779056, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 2.7862226963043213 + }, + { + "auxiliary_loss_clip": 0.01520302, + "auxiliary_loss_mlp": 0.01417633, + "balance_loss_clip": 1.16402555, + "balance_loss_mlp": 1.1614759, + "epoch": 0.17165188636705245, + "flos": 20740993660800.0, + "grad_norm": 2.120948700021145, + "language_loss": 0.89254558, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.92192489, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 2.7612721920013428 + }, + { + "auxiliary_loss_clip": 0.01529385, + "auxiliary_loss_mlp": 0.01402752, + "balance_loss_clip": 1.17037094, + "balance_loss_mlp": 1.14297175, + "epoch": 0.17171200961972044, + "flos": 24902209483200.0, + "grad_norm": 2.5012490340060336, + "language_loss": 0.8306011, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.85992253, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.7848563194274902 + }, + { + "auxiliary_loss_clip": 0.01522576, + "auxiliary_loss_mlp": 0.01368998, + "balance_loss_clip": 1.16473258, + "balance_loss_mlp": 1.10444891, + "epoch": 0.1717721328723884, + "flos": 20159321569920.0, + "grad_norm": 2.5957339485597997, + "language_loss": 0.86978441, + "learning_rate": 3.792836613639026e-06, + "loss": 0.89870018, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 2.8171322345733643 + }, + { + "auxiliary_loss_clip": 0.01520171, + "auxiliary_loss_mlp": 0.01386554, + "balance_loss_clip": 1.1631192, + "balance_loss_mlp": 1.12257731, + "epoch": 0.17183225612505637, + "flos": 23363562515040.0, + "grad_norm": 2.5768210637096907, + "language_loss": 0.78946042, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.8185277, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 2.8366570472717285 + }, + { + "auxiliary_loss_clip": 0.01518453, + "auxiliary_loss_mlp": 0.01361937, + "balance_loss_clip": 1.16100836, + "balance_loss_mlp": 1.09147525, + "epoch": 0.17189237937772434, + "flos": 18116224973280.0, + "grad_norm": 4.3972154592645865, + "language_loss": 0.7746464, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.80345029, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.770151376724243 + }, + { + "auxiliary_loss_clip": 0.0151895, + "auxiliary_loss_mlp": 0.01363476, + "balance_loss_clip": 1.16154683, + "balance_loss_mlp": 1.09034336, + "epoch": 0.1719525026303923, + "flos": 23260662324000.0, + "grad_norm": 4.536948031825065, + "language_loss": 0.76815927, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79698348, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.8000266551971436 + }, + { + "auxiliary_loss_clip": 0.01519278, + "auxiliary_loss_mlp": 0.01364404, + "balance_loss_clip": 1.16197646, + "balance_loss_mlp": 1.09947324, + "epoch": 0.17201262588306027, + "flos": 20812337258400.0, + "grad_norm": 4.248431622472994, + "language_loss": 0.81733793, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84617472, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.844865083694458 + }, + { + "auxiliary_loss_clip": 0.01521065, + "auxiliary_loss_mlp": 0.01357642, + "balance_loss_clip": 1.16343725, + "balance_loss_mlp": 1.0871799, + "epoch": 0.17207274913572823, + "flos": 20377941670080.0, + "grad_norm": 2.397770398181378, + "language_loss": 0.85974169, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.8885287, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 2.8173961639404297 + }, + { + "auxiliary_loss_clip": 0.0153233, + "auxiliary_loss_mlp": 0.01376127, + "balance_loss_clip": 1.17383671, + "balance_loss_mlp": 1.11787224, + "epoch": 0.17213287238839622, + "flos": 26800191482400.0, + "grad_norm": 2.155607992365697, + "language_loss": 0.78025746, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80934203, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 3.0406036376953125 + }, + { + "auxiliary_loss_clip": 0.01524339, + "auxiliary_loss_mlp": 0.01361384, + "balance_loss_clip": 1.16578054, + "balance_loss_mlp": 1.09301984, + "epoch": 0.1721929956410642, + "flos": 26033200580160.0, + "grad_norm": 2.0326167681831593, + "language_loss": 0.72461289, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.75347012, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 2.9077224731445312 + }, + { + "auxiliary_loss_clip": 0.01530479, + "auxiliary_loss_mlp": 0.01379021, + "balance_loss_clip": 1.17132545, + "balance_loss_mlp": 1.11199188, + "epoch": 0.17225311889373215, + "flos": 22275278890560.0, + "grad_norm": 1.8835977714279422, + "language_loss": 0.72475135, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75384641, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.7893197536468506 + }, + { + "auxiliary_loss_clip": 0.01539539, + "auxiliary_loss_mlp": 0.01413311, + "balance_loss_clip": 1.18039405, + "balance_loss_mlp": 1.15753555, + "epoch": 0.17231324214640012, + "flos": 21289895457120.0, + "grad_norm": 3.477081143730137, + "language_loss": 0.78885055, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81837904, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 2.805556297302246 + }, + { + "auxiliary_loss_clip": 0.01525707, + "auxiliary_loss_mlp": 0.01378489, + "balance_loss_clip": 1.16656339, + "balance_loss_mlp": 1.11298585, + "epoch": 0.17237336539906808, + "flos": 19682673647040.0, + "grad_norm": 2.284151129148228, + "language_loss": 0.79697514, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82601702, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 2.798452854156494 + }, + { + "auxiliary_loss_clip": 0.01524155, + "auxiliary_loss_mlp": 0.01374794, + "balance_loss_clip": 1.16521132, + "balance_loss_mlp": 1.10986352, + "epoch": 0.17243348865173605, + "flos": 17531556557760.0, + "grad_norm": 2.4572277961818005, + "language_loss": 0.79738307, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.82637256, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 4.3428778648376465 + }, + { + "auxiliary_loss_clip": 0.01526417, + "auxiliary_loss_mlp": 0.01363639, + "balance_loss_clip": 1.16683984, + "balance_loss_mlp": 1.09432149, + "epoch": 0.17249361190440402, + "flos": 18261832636800.0, + "grad_norm": 2.040637228077456, + "language_loss": 0.84355849, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.87245905, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 2.806331157684326 + }, + { + "auxiliary_loss_clip": 0.0152448, + "auxiliary_loss_mlp": 0.0135033, + "balance_loss_clip": 1.16541314, + "balance_loss_mlp": 1.07776952, + "epoch": 0.172553735157072, + "flos": 21176261596800.0, + "grad_norm": 2.83720043300823, + "language_loss": 0.77264464, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.80139267, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.7819607257843018 + }, + { + "auxiliary_loss_clip": 0.01534178, + "auxiliary_loss_mlp": 0.01345699, + "balance_loss_clip": 1.17806196, + "balance_loss_mlp": 1.07485509, + "epoch": 0.17261385840973997, + "flos": 22275468531360.0, + "grad_norm": 1.9065975743161427, + "language_loss": 0.7723248, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.80112356, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 2.8517580032348633 + }, + { + "auxiliary_loss_clip": 0.01534002, + "auxiliary_loss_mlp": 0.01360132, + "balance_loss_clip": 1.17562747, + "balance_loss_mlp": 1.09157753, + "epoch": 0.17267398166240794, + "flos": 27924393438720.0, + "grad_norm": 4.15631921372644, + "language_loss": 0.74299932, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.77194071, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.872833728790283 + }, + { + "auxiliary_loss_clip": 0.01527133, + "auxiliary_loss_mlp": 0.01343889, + "balance_loss_clip": 1.16734195, + "balance_loss_mlp": 1.072855, + "epoch": 0.1727341049150759, + "flos": 21947310812160.0, + "grad_norm": 2.007970930064753, + "language_loss": 0.82706565, + "learning_rate": 3.790066109323988e-06, + "loss": 0.85577589, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 2.855332851409912 + }, + { + "auxiliary_loss_clip": 0.01524441, + "auxiliary_loss_mlp": 0.01362476, + "balance_loss_clip": 1.16458774, + "balance_loss_mlp": 1.09830809, + "epoch": 0.17279422816774387, + "flos": 18109511688960.0, + "grad_norm": 4.960426954705287, + "language_loss": 0.75164378, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.78051299, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 2.8084774017333984 + }, + { + "auxiliary_loss_clip": 0.01530183, + "auxiliary_loss_mlp": 0.01368195, + "balance_loss_clip": 1.17073607, + "balance_loss_mlp": 1.10402751, + "epoch": 0.17285435142041183, + "flos": 21837621480480.0, + "grad_norm": 2.0494758487744797, + "language_loss": 0.80883741, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.83782125, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 4.230652570724487 + }, + { + "auxiliary_loss_clip": 0.015268, + "auxiliary_loss_mlp": 0.01398459, + "balance_loss_clip": 1.16754758, + "balance_loss_mlp": 1.13371861, + "epoch": 0.17291447467307983, + "flos": 18370384123680.0, + "grad_norm": 2.9870880550627072, + "language_loss": 0.87830555, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.9075582, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 2.785918951034546 + }, + { + "auxiliary_loss_clip": 0.01527769, + "auxiliary_loss_mlp": 0.01354608, + "balance_loss_clip": 1.16746521, + "balance_loss_mlp": 1.0820477, + "epoch": 0.1729745979257478, + "flos": 18626629322880.0, + "grad_norm": 2.7253840667193248, + "language_loss": 0.84744811, + "learning_rate": 3.789370767013681e-06, + "loss": 0.8762719, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 2.7894368171691895 + }, + { + "auxiliary_loss_clip": 0.01529209, + "auxiliary_loss_mlp": 0.01367291, + "balance_loss_clip": 1.16967356, + "balance_loss_mlp": 1.09701979, + "epoch": 0.17303472117841576, + "flos": 23000207099040.0, + "grad_norm": 4.651084813481411, + "language_loss": 0.79639339, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.82535839, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 5.822048187255859 + }, + { + "auxiliary_loss_clip": 0.01527916, + "auxiliary_loss_mlp": 0.01371712, + "balance_loss_clip": 1.16756964, + "balance_loss_mlp": 1.10506439, + "epoch": 0.17309484443108372, + "flos": 25666659198720.0, + "grad_norm": 2.816662249351901, + "language_loss": 0.70489323, + "learning_rate": 3.78902268871344e-06, + "loss": 0.73388946, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.8249242305755615 + }, + { + "auxiliary_loss_clip": 0.01522588, + "auxiliary_loss_mlp": 0.01353971, + "balance_loss_clip": 1.16277599, + "balance_loss_mlp": 1.08560681, + "epoch": 0.1731549676837517, + "flos": 13554597558240.0, + "grad_norm": 2.7894625363109884, + "language_loss": 0.83134615, + "learning_rate": 3.78884854780014e-06, + "loss": 0.86011171, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.760070562362671 + }, + { + "auxiliary_loss_clip": 0.01523367, + "auxiliary_loss_mlp": 0.01359704, + "balance_loss_clip": 1.16404676, + "balance_loss_mlp": 1.09286559, + "epoch": 0.17321509093641965, + "flos": 22859302527360.0, + "grad_norm": 2.295447856041984, + "language_loss": 0.81767642, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.84650719, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 2.8178329467773438 + }, + { + "auxiliary_loss_clip": 0.01527361, + "auxiliary_loss_mlp": 0.01349095, + "balance_loss_clip": 1.16719747, + "balance_loss_mlp": 1.08168507, + "epoch": 0.17327521418908762, + "flos": 24355090310400.0, + "grad_norm": 2.352807675824362, + "language_loss": 0.77420616, + "learning_rate": 3.788500062480197e-06, + "loss": 0.80297071, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.8068108558654785 + }, + { + "auxiliary_loss_clip": 0.01529479, + "auxiliary_loss_mlp": 0.01352592, + "balance_loss_clip": 1.16976571, + "balance_loss_mlp": 1.08308387, + "epoch": 0.1733353374417556, + "flos": 33108238512000.0, + "grad_norm": 2.0395958097966638, + "language_loss": 0.76361954, + "learning_rate": 3.788325718086769e-06, + "loss": 0.7924403, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 2.9170334339141846 + }, + { + "auxiliary_loss_clip": 0.01533671, + "auxiliary_loss_mlp": 0.01378821, + "balance_loss_clip": 1.17344332, + "balance_loss_mlp": 1.11503482, + "epoch": 0.17339546069442358, + "flos": 24391236211200.0, + "grad_norm": 2.3019081749620893, + "language_loss": 0.8563779, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.88550282, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.8326330184936523 + }, + { + "auxiliary_loss_clip": 0.01527979, + "auxiliary_loss_mlp": 0.01348687, + "balance_loss_clip": 1.16953146, + "balance_loss_mlp": 1.07555437, + "epoch": 0.17345558394709154, + "flos": 27456658633440.0, + "grad_norm": 2.6329639605022592, + "language_loss": 0.74485338, + "learning_rate": 3.787976825866055e-06, + "loss": 0.77362007, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.8823390007019043 + }, + { + "auxiliary_loss_clip": 0.01529724, + "auxiliary_loss_mlp": 0.01345116, + "balance_loss_clip": 1.17014122, + "balance_loss_mlp": 1.07369995, + "epoch": 0.1735157071997595, + "flos": 24684537587040.0, + "grad_norm": 1.7952726869347166, + "language_loss": 0.7098788, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.7386272, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.863086462020874 + }, + { + "auxiliary_loss_clip": 0.01522291, + "auxiliary_loss_mlp": 0.01334914, + "balance_loss_clip": 1.16313052, + "balance_loss_mlp": 1.06120908, + "epoch": 0.17357583045242747, + "flos": 21691065612960.0, + "grad_norm": 2.461126284596029, + "language_loss": 0.69726473, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.72583675, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 2.8063836097717285 + }, + { + "auxiliary_loss_clip": 0.01529485, + "auxiliary_loss_mlp": 0.01344704, + "balance_loss_clip": 1.16945362, + "balance_loss_mlp": 1.07214355, + "epoch": 0.17363595370509544, + "flos": 15377443143840.0, + "grad_norm": 3.928496137850331, + "language_loss": 0.85376883, + "learning_rate": 3.787452979049585e-06, + "loss": 0.88251078, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.8368887901306152 + }, + { + "auxiliary_loss_clip": 0.01523277, + "auxiliary_loss_mlp": 0.01335923, + "balance_loss_clip": 1.16380572, + "balance_loss_mlp": 1.05592406, + "epoch": 0.1736960769577634, + "flos": 23443212379680.0, + "grad_norm": 2.2502562549339693, + "language_loss": 0.78513575, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.8137278, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 2.8776755332946777 + }, + { + "auxiliary_loss_clip": 0.01531536, + "auxiliary_loss_mlp": 0.0135243, + "balance_loss_clip": 1.17152739, + "balance_loss_mlp": 1.07700896, + "epoch": 0.1737562002104314, + "flos": 18589800715200.0, + "grad_norm": 4.931819005570125, + "language_loss": 0.84204423, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.87088382, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 2.910212278366089 + }, + { + "auxiliary_loss_clip": 0.01527647, + "auxiliary_loss_mlp": 0.01348902, + "balance_loss_clip": 1.16732121, + "balance_loss_mlp": 1.07271814, + "epoch": 0.17381632346309936, + "flos": 15999926299200.0, + "grad_norm": 2.7357375762909117, + "language_loss": 0.82957023, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.85833573, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.7946932315826416 + }, + { + "auxiliary_loss_clip": 0.01526181, + "auxiliary_loss_mlp": 0.01361002, + "balance_loss_clip": 1.16565871, + "balance_loss_mlp": 1.08977699, + "epoch": 0.17387644671576732, + "flos": 13371478580160.0, + "grad_norm": 2.3606925069776423, + "language_loss": 0.81779218, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.84666407, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 2.9595847129821777 + }, + { + "auxiliary_loss_clip": 0.0152844, + "auxiliary_loss_mlp": 0.01350333, + "balance_loss_clip": 1.16847038, + "balance_loss_mlp": 1.06995249, + "epoch": 0.1739365699684353, + "flos": 26617982780160.0, + "grad_norm": 2.810968456566989, + "language_loss": 0.74529809, + "learning_rate": 3.786578545502627e-06, + "loss": 0.77408582, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 2.874505043029785 + }, + { + "auxiliary_loss_clip": 0.01535474, + "auxiliary_loss_mlp": 0.01348367, + "balance_loss_clip": 1.17566299, + "balance_loss_mlp": 1.07027507, + "epoch": 0.17399669322110325, + "flos": 23370237871200.0, + "grad_norm": 3.309402126190095, + "language_loss": 0.83331156, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.86214995, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 2.8323891162872314 + }, + { + "auxiliary_loss_clip": 0.01520707, + "auxiliary_loss_mlp": 0.0134498, + "balance_loss_clip": 1.16117883, + "balance_loss_mlp": 1.06364632, + "epoch": 0.17405681647377122, + "flos": 22056279508800.0, + "grad_norm": 2.449852225640246, + "language_loss": 0.7450754, + "learning_rate": 3.786228297806741e-06, + "loss": 0.7737323, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 2.877037286758423 + }, + { + "auxiliary_loss_clip": 0.01659897, + "auxiliary_loss_mlp": 0.01477692, + "balance_loss_clip": 1.29641318, + "balance_loss_mlp": 1.28619385, + "epoch": 0.1741169397264392, + "flos": 61463917065600.0, + "grad_norm": 0.9091585044472181, + "language_loss": 0.62713492, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.6585108, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.4436798095703125 + }, + { + "auxiliary_loss_clip": 0.01516424, + "auxiliary_loss_mlp": 0.01347606, + "balance_loss_clip": 1.15803194, + "balance_loss_mlp": 1.07046819, + "epoch": 0.17417706297910718, + "flos": 27020214924480.0, + "grad_norm": 2.2588638647625716, + "language_loss": 0.75985515, + "learning_rate": 3.785877779175034e-06, + "loss": 0.78849554, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 2.855879068374634 + }, + { + "auxiliary_loss_clip": 0.01528616, + "auxiliary_loss_mlp": 0.01334037, + "balance_loss_clip": 1.1690141, + "balance_loss_mlp": 1.0595696, + "epoch": 0.17423718623177514, + "flos": 33511570572960.0, + "grad_norm": 1.8926134493832927, + "language_loss": 0.68978053, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71840703, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.935804605484009 + }, + { + "auxiliary_loss_clip": 0.01521654, + "auxiliary_loss_mlp": 0.01332092, + "balance_loss_clip": 1.16227937, + "balance_loss_mlp": 1.0494225, + "epoch": 0.1742973094844431, + "flos": 27201096141120.0, + "grad_norm": 2.9743592363888256, + "language_loss": 0.76158786, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.79012531, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.782695770263672 + }, + { + "auxiliary_loss_clip": 0.01527009, + "auxiliary_loss_mlp": 0.01360641, + "balance_loss_clip": 1.16746676, + "balance_loss_mlp": 1.09361184, + "epoch": 0.17435743273711107, + "flos": 22712936300640.0, + "grad_norm": 6.4763640314610225, + "language_loss": 0.72609961, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75497615, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.8628172874450684 + }, + { + "auxiliary_loss_clip": 0.01522721, + "auxiliary_loss_mlp": 0.01356231, + "balance_loss_clip": 1.16329837, + "balance_loss_mlp": 1.08119166, + "epoch": 0.17441755598977904, + "flos": 41649707466720.0, + "grad_norm": 1.6731272833422017, + "language_loss": 0.69923651, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72802603, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 2.9649102687835693 + }, + { + "auxiliary_loss_clip": 0.01521954, + "auxiliary_loss_mlp": 0.01347049, + "balance_loss_clip": 1.16298246, + "balance_loss_mlp": 1.07067418, + "epoch": 0.174477679242447, + "flos": 26289521635680.0, + "grad_norm": 1.8882858681662862, + "language_loss": 0.76006246, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.7887525, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 2.8218839168548584 + }, + { + "auxiliary_loss_clip": 0.01521843, + "auxiliary_loss_mlp": 0.01345634, + "balance_loss_clip": 1.16252494, + "balance_loss_mlp": 1.06849575, + "epoch": 0.174537802495115, + "flos": 17860359055680.0, + "grad_norm": 1.9985158822866147, + "language_loss": 0.81570685, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.84438163, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 2.8472728729248047 + }, + { + "auxiliary_loss_clip": 0.01524736, + "auxiliary_loss_mlp": 0.01359977, + "balance_loss_clip": 1.16479456, + "balance_loss_mlp": 1.08665359, + "epoch": 0.17459792574778296, + "flos": 16942298834880.0, + "grad_norm": 1.9936832628080867, + "language_loss": 0.73224908, + "learning_rate": 3.784648831112429e-06, + "loss": 0.76109624, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.7656118869781494 + }, + { + "auxiliary_loss_clip": 0.01528031, + "auxiliary_loss_mlp": 0.01339982, + "balance_loss_clip": 1.16899705, + "balance_loss_mlp": 1.06532359, + "epoch": 0.17465804900045093, + "flos": 25522379020800.0, + "grad_norm": 2.647330561164036, + "language_loss": 0.64645308, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.67513323, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 2.8691303730010986 + }, + { + "auxiliary_loss_clip": 0.01519677, + "auxiliary_loss_mlp": 0.01372824, + "balance_loss_clip": 1.161924, + "balance_loss_mlp": 1.09225297, + "epoch": 0.1747181722531189, + "flos": 24131767118400.0, + "grad_norm": 2.111376040604052, + "language_loss": 0.79693592, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.82586092, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 4.373985528945923 + }, + { + "auxiliary_loss_clip": 0.01519951, + "auxiliary_loss_mlp": 0.01334394, + "balance_loss_clip": 1.16185689, + "balance_loss_mlp": 1.05611157, + "epoch": 0.17477829550578686, + "flos": 17750745580320.0, + "grad_norm": 2.5086291669779213, + "language_loss": 0.81214452, + "learning_rate": 3.784121123841449e-06, + "loss": 0.84068793, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.77669620513916 + }, + { + "auxiliary_loss_clip": 0.01519635, + "auxiliary_loss_mlp": 0.0134071, + "balance_loss_clip": 1.16112792, + "balance_loss_mlp": 1.06586146, + "epoch": 0.17483841875845482, + "flos": 15379149911040.0, + "grad_norm": 3.061552964984915, + "language_loss": 0.8062948, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83489823, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 2.794403314590454 + }, + { + "auxiliary_loss_clip": 0.01523955, + "auxiliary_loss_mlp": 0.01347654, + "balance_loss_clip": 1.16551316, + "balance_loss_mlp": 1.06898999, + "epoch": 0.17489854201112282, + "flos": 17165053104480.0, + "grad_norm": 3.6140697402982314, + "language_loss": 0.80678755, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.83550364, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.8193490505218506 + }, + { + "auxiliary_loss_clip": 0.01521907, + "auxiliary_loss_mlp": 0.01352039, + "balance_loss_clip": 1.16368365, + "balance_loss_mlp": 1.07909703, + "epoch": 0.17495866526379078, + "flos": 19757316994560.0, + "grad_norm": 4.45654653669087, + "language_loss": 0.76705569, + "learning_rate": 3.783592807684017e-06, + "loss": 0.79579508, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.8433337211608887 + }, + { + "auxiliary_loss_clip": 0.01521484, + "auxiliary_loss_mlp": 0.01338115, + "balance_loss_clip": 1.16283011, + "balance_loss_mlp": 1.06364751, + "epoch": 0.17501878851645875, + "flos": 28513613233440.0, + "grad_norm": 1.7600550030582345, + "language_loss": 0.87205976, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.90065575, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 2.8406152725219727 + }, + { + "auxiliary_loss_clip": 0.01526061, + "auxiliary_loss_mlp": 0.01356244, + "balance_loss_clip": 1.1679188, + "balance_loss_mlp": 1.08807075, + "epoch": 0.1750789117691267, + "flos": 17933143923360.0, + "grad_norm": 2.4622367448333646, + "language_loss": 0.89924955, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.92807257, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.7872226238250732 + }, + { + "auxiliary_loss_clip": 0.01521182, + "auxiliary_loss_mlp": 0.01361753, + "balance_loss_clip": 1.16286325, + "balance_loss_mlp": 1.09262586, + "epoch": 0.17513903502179468, + "flos": 18261377498880.0, + "grad_norm": 4.018024535795325, + "language_loss": 0.72661275, + "learning_rate": 3.783063882820439e-06, + "loss": 0.75544208, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 4.407833814620972 + }, + { + "auxiliary_loss_clip": 0.01522345, + "auxiliary_loss_mlp": 0.01360899, + "balance_loss_clip": 1.16427422, + "balance_loss_mlp": 1.09387016, + "epoch": 0.17519915827446264, + "flos": 20706857952480.0, + "grad_norm": 2.5617503100373846, + "language_loss": 0.69744492, + "learning_rate": 3.782887439295741e-06, + "loss": 0.72627735, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 2.8000433444976807 + }, + { + "auxiliary_loss_clip": 0.01530482, + "auxiliary_loss_mlp": 0.01380073, + "balance_loss_clip": 1.1727097, + "balance_loss_mlp": 1.11628652, + "epoch": 0.1752592815271306, + "flos": 20525597454240.0, + "grad_norm": 2.022328914434288, + "language_loss": 0.93157238, + "learning_rate": 3.782710928163772e-06, + "loss": 0.96067792, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 4.38176703453064 + }, + { + "auxiliary_loss_clip": 0.01534739, + "auxiliary_loss_mlp": 0.01385734, + "balance_loss_clip": 1.17689383, + "balance_loss_mlp": 1.12671602, + "epoch": 0.1753194047797986, + "flos": 21801361795200.0, + "grad_norm": 2.5775405829358826, + "language_loss": 0.80898613, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83819085, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 4.469643831253052 + }, + { + "auxiliary_loss_clip": 0.01522933, + "auxiliary_loss_mlp": 0.01361761, + "balance_loss_clip": 1.16640544, + "balance_loss_mlp": 1.0884378, + "epoch": 0.17537952803246656, + "flos": 20670484482720.0, + "grad_norm": 1.6949810237332497, + "language_loss": 0.7373367, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76618361, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.7935264110565186 + }, + { + "auxiliary_loss_clip": 0.01537921, + "auxiliary_loss_mlp": 0.0136095, + "balance_loss_clip": 1.18134701, + "balance_loss_mlp": 1.09697282, + "epoch": 0.17543965128513453, + "flos": 23297339219040.0, + "grad_norm": 3.2348793348577263, + "language_loss": 0.76899064, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.7979793, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.823439598083496 + }, + { + "auxiliary_loss_clip": 0.01522789, + "auxiliary_loss_mlp": 0.01341895, + "balance_loss_clip": 1.16570282, + "balance_loss_mlp": 1.06571126, + "epoch": 0.1754997745378025, + "flos": 29098167864480.0, + "grad_norm": 3.2494994388514846, + "language_loss": 0.74529052, + "learning_rate": 3.782004207697098e-06, + "loss": 0.77393734, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.875857353210449 + }, + { + "auxiliary_loss_clip": 0.01527163, + "auxiliary_loss_mlp": 0.01343569, + "balance_loss_clip": 1.17154145, + "balance_loss_mlp": 1.06547773, + "epoch": 0.17555989779047046, + "flos": 30374425271520.0, + "grad_norm": 2.201414073783406, + "language_loss": 0.74293208, + "learning_rate": 3.781827358629228e-06, + "loss": 0.77163947, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 2.8786380290985107 + }, + { + "auxiliary_loss_clip": 0.01535518, + "auxiliary_loss_mlp": 0.01346572, + "balance_loss_clip": 1.17881525, + "balance_loss_mlp": 1.07877994, + "epoch": 0.17562002104313842, + "flos": 23289601874400.0, + "grad_norm": 2.3106980214723603, + "language_loss": 0.79705805, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.82587898, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 2.796273946762085 + }, + { + "auxiliary_loss_clip": 0.01536641, + "auxiliary_loss_mlp": 0.01367648, + "balance_loss_clip": 1.17929745, + "balance_loss_mlp": 1.09432471, + "epoch": 0.1756801442958064, + "flos": 24792823576800.0, + "grad_norm": 1.7473171022574514, + "language_loss": 0.87577254, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.90481544, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 2.9656882286071777 + }, + { + "auxiliary_loss_clip": 0.01536767, + "auxiliary_loss_mlp": 0.0134752, + "balance_loss_clip": 1.1786375, + "balance_loss_mlp": 1.07667661, + "epoch": 0.17574026754847438, + "flos": 25773807343680.0, + "grad_norm": 2.8312623569370476, + "language_loss": 0.62419379, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.65303659, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 2.9016923904418945 + }, + { + "auxiliary_loss_clip": 0.01536674, + "auxiliary_loss_mlp": 0.0134891, + "balance_loss_clip": 1.17914653, + "balance_loss_mlp": 1.07425237, + "epoch": 0.17580039080114235, + "flos": 17458506192960.0, + "grad_norm": 2.7443563404023075, + "language_loss": 0.8097136, + "learning_rate": 3.78111928675413e-06, + "loss": 0.8385694, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 2.756019115447998 + }, + { + "auxiliary_loss_clip": 0.01533881, + "auxiliary_loss_mlp": 0.01352549, + "balance_loss_clip": 1.17624998, + "balance_loss_mlp": 1.07750916, + "epoch": 0.1758605140538103, + "flos": 14866318159200.0, + "grad_norm": 2.3645321333391096, + "language_loss": 0.71079141, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73965567, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 2.754693031311035 + }, + { + "auxiliary_loss_clip": 0.01545188, + "auxiliary_loss_mlp": 0.0135793, + "balance_loss_clip": 1.18728781, + "balance_loss_mlp": 1.08479786, + "epoch": 0.17592063730647828, + "flos": 23006579029920.0, + "grad_norm": 2.6898862499829193, + "language_loss": 0.71756506, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74659628, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.844472646713257 + }, + { + "auxiliary_loss_clip": 0.01538978, + "auxiliary_loss_mlp": 0.01356107, + "balance_loss_clip": 1.18282437, + "balance_loss_mlp": 1.08221173, + "epoch": 0.17598076055914624, + "flos": 20743648632000.0, + "grad_norm": 4.37922925352258, + "language_loss": 0.84905422, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.87800509, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.782708168029785 + }, + { + "auxiliary_loss_clip": 0.01546011, + "auxiliary_loss_mlp": 0.01333084, + "balance_loss_clip": 1.18819821, + "balance_loss_mlp": 1.06319427, + "epoch": 0.1760408838118142, + "flos": 34095177000000.0, + "grad_norm": 2.337539959764646, + "language_loss": 0.72355103, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.75234199, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 2.9677071571350098 + }, + { + "auxiliary_loss_clip": 0.01547304, + "auxiliary_loss_mlp": 0.01347804, + "balance_loss_clip": 1.18967068, + "balance_loss_mlp": 1.07848608, + "epoch": 0.1761010070644822, + "flos": 24170567990400.0, + "grad_norm": 2.1855825848629746, + "language_loss": 0.83253312, + "learning_rate": 3.780232677305744e-06, + "loss": 0.86148417, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 2.8722472190856934 + }, + { + "auxiliary_loss_clip": 0.01540808, + "auxiliary_loss_mlp": 0.01356618, + "balance_loss_clip": 1.18312216, + "balance_loss_mlp": 1.08825421, + "epoch": 0.17616113031715017, + "flos": 26579030195520.0, + "grad_norm": 1.7308581780679537, + "language_loss": 0.79463136, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.82360566, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 2.8138298988342285 + }, + { + "auxiliary_loss_clip": 0.01552586, + "auxiliary_loss_mlp": 0.01366885, + "balance_loss_clip": 1.19488764, + "balance_loss_mlp": 1.09318042, + "epoch": 0.17622125356981813, + "flos": 25669086600960.0, + "grad_norm": 2.181919067009824, + "language_loss": 0.76618439, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.79537916, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 2.8269853591918945 + }, + { + "auxiliary_loss_clip": 0.01540338, + "auxiliary_loss_mlp": 0.01353475, + "balance_loss_clip": 1.18392122, + "balance_loss_mlp": 1.08110583, + "epoch": 0.1762813768224861, + "flos": 16510406505120.0, + "grad_norm": 3.769498083003341, + "language_loss": 0.75582016, + "learning_rate": 3.779699901503696e-06, + "loss": 0.78475827, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 2.754047155380249 + }, + { + "auxiliary_loss_clip": 0.01540332, + "auxiliary_loss_mlp": 0.01344014, + "balance_loss_clip": 1.18447065, + "balance_loss_mlp": 1.06916547, + "epoch": 0.17634150007515406, + "flos": 11213079284160.0, + "grad_norm": 2.6090408974931667, + "language_loss": 0.90054792, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92939138, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 2.757356882095337 + }, + { + "auxiliary_loss_clip": 0.01538593, + "auxiliary_loss_mlp": 0.01341428, + "balance_loss_clip": 1.1825422, + "balance_loss_mlp": 1.0629555, + "epoch": 0.17640162332782203, + "flos": 23662211761440.0, + "grad_norm": 1.9793563487151002, + "language_loss": 0.88414985, + "learning_rate": 3.779344380192448e-06, + "loss": 0.91295004, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.817838430404663 + }, + { + "auxiliary_loss_clip": 0.01543736, + "auxiliary_loss_mlp": 0.01341808, + "balance_loss_clip": 1.1860317, + "balance_loss_mlp": 1.07382584, + "epoch": 0.17646174658049, + "flos": 53800949260800.0, + "grad_norm": 2.3490728637978853, + "language_loss": 0.70723832, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73609376, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 3.0912373065948486 + }, + { + "auxiliary_loss_clip": 0.01536433, + "auxiliary_loss_mlp": 0.01347231, + "balance_loss_clip": 1.17953277, + "balance_loss_mlp": 1.07295477, + "epoch": 0.17652186983315798, + "flos": 24246197470080.0, + "grad_norm": 2.9180528585184624, + "language_loss": 0.69820464, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.72704136, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.8242664337158203 + }, + { + "auxiliary_loss_clip": 0.01546145, + "auxiliary_loss_mlp": 0.01348806, + "balance_loss_clip": 1.18899417, + "balance_loss_mlp": 1.07834351, + "epoch": 0.17658199308582595, + "flos": 27456772417920.0, + "grad_norm": 2.419459484657842, + "language_loss": 0.7187134, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.7476629, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.84881854057312 + }, + { + "auxiliary_loss_clip": 0.01545271, + "auxiliary_loss_mlp": 0.01356995, + "balance_loss_clip": 1.18834066, + "balance_loss_mlp": 1.08367193, + "epoch": 0.17664211633849392, + "flos": 22420848625920.0, + "grad_norm": 2.9655766252101077, + "language_loss": 0.75754094, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78656363, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.825392723083496 + }, + { + "auxiliary_loss_clip": 0.01542348, + "auxiliary_loss_mlp": 0.01343133, + "balance_loss_clip": 1.18535924, + "balance_loss_mlp": 1.07228994, + "epoch": 0.17670223959116188, + "flos": 24717307881600.0, + "grad_norm": 2.31129661098031, + "language_loss": 0.70745671, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.7363115, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.785072088241577 + }, + { + "auxiliary_loss_clip": 0.01541633, + "auxiliary_loss_mlp": 0.01361013, + "balance_loss_clip": 1.18422985, + "balance_loss_mlp": 1.09341216, + "epoch": 0.17676236284382985, + "flos": 22529172543840.0, + "grad_norm": 2.7712923425508, + "language_loss": 0.7403332, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.76935971, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 2.8169772624969482 + }, + { + "auxiliary_loss_clip": 0.01537221, + "auxiliary_loss_mlp": 0.01343899, + "balance_loss_clip": 1.17910433, + "balance_loss_mlp": 1.07858717, + "epoch": 0.1768224860964978, + "flos": 12386777853600.0, + "grad_norm": 2.6070769989547737, + "language_loss": 0.85840458, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.88721585, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 2.762685537338257 + }, + { + "auxiliary_loss_clip": 0.01538961, + "auxiliary_loss_mlp": 0.01341778, + "balance_loss_clip": 1.1815064, + "balance_loss_mlp": 1.07303238, + "epoch": 0.1768826093491658, + "flos": 24355697160960.0, + "grad_norm": 2.6488279640553802, + "language_loss": 0.77336466, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.80217206, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 2.818026542663574 + }, + { + "auxiliary_loss_clip": 0.01537495, + "auxiliary_loss_mlp": 0.01345887, + "balance_loss_clip": 1.18074346, + "balance_loss_mlp": 1.0637902, + "epoch": 0.17694273260183377, + "flos": 23589351037440.0, + "grad_norm": 2.1903821806007566, + "language_loss": 0.80485976, + "learning_rate": 3.77774119516197e-06, + "loss": 0.83369362, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.838317394256592 + }, + { + "auxiliary_loss_clip": 0.01545273, + "auxiliary_loss_mlp": 0.01357942, + "balance_loss_clip": 1.18881559, + "balance_loss_mlp": 1.08633542, + "epoch": 0.17700285585450173, + "flos": 26763438731040.0, + "grad_norm": 2.028294579157089, + "language_loss": 0.80649257, + "learning_rate": 3.777562726341155e-06, + "loss": 0.83552474, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 4.385870933532715 + }, + { + "auxiliary_loss_clip": 0.01536311, + "auxiliary_loss_mlp": 0.01323045, + "balance_loss_clip": 1.17897654, + "balance_loss_mlp": 1.05010307, + "epoch": 0.1770629791071697, + "flos": 42779560718880.0, + "grad_norm": 2.1362637632944534, + "language_loss": 0.73962563, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.76821917, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.9375181198120117 + }, + { + "auxiliary_loss_clip": 0.01548251, + "auxiliary_loss_mlp": 0.01347688, + "balance_loss_clip": 1.18937469, + "balance_loss_mlp": 1.07722592, + "epoch": 0.17712310235983766, + "flos": 17347375591200.0, + "grad_norm": 3.7016230730270463, + "language_loss": 0.78115761, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.81011701, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 2.775359869003296 + }, + { + "auxiliary_loss_clip": 0.01542271, + "auxiliary_loss_mlp": 0.01336381, + "balance_loss_clip": 1.18479133, + "balance_loss_mlp": 1.06229484, + "epoch": 0.17718322561250563, + "flos": 23880756005280.0, + "grad_norm": 2.1917737939719086, + "language_loss": 0.7627477, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.79153419, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 2.848616600036621 + }, + { + "auxiliary_loss_clip": 0.01533827, + "auxiliary_loss_mlp": 0.01329607, + "balance_loss_clip": 1.17690206, + "balance_loss_mlp": 1.0501802, + "epoch": 0.1772433488651736, + "flos": 36469351784160.0, + "grad_norm": 2.7214824532098363, + "language_loss": 0.72657716, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.75521147, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.9645447731018066 + }, + { + "auxiliary_loss_clip": 0.01537133, + "auxiliary_loss_mlp": 0.01325158, + "balance_loss_clip": 1.18011606, + "balance_loss_mlp": 1.04687619, + "epoch": 0.1773034721178416, + "flos": 26686974831840.0, + "grad_norm": 2.184970802033022, + "language_loss": 0.82072622, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84934914, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.8928472995758057 + }, + { + "auxiliary_loss_clip": 0.01651061, + "auxiliary_loss_mlp": 0.01524521, + "balance_loss_clip": 1.29168856, + "balance_loss_mlp": 1.30250549, + "epoch": 0.17736359537050955, + "flos": 57123678506400.0, + "grad_norm": 0.8729298941574081, + "language_loss": 0.64975381, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.68150961, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.4011857509613037 + }, + { + "auxiliary_loss_clip": 0.0153212, + "auxiliary_loss_mlp": 0.0134338, + "balance_loss_clip": 1.17644572, + "balance_loss_mlp": 1.0668149, + "epoch": 0.17742371862317752, + "flos": 27200641003200.0, + "grad_norm": 3.001407616838095, + "language_loss": 0.84577417, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.87452912, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 4.328538656234741 + }, + { + "auxiliary_loss_clip": 0.01535032, + "auxiliary_loss_mlp": 0.01362564, + "balance_loss_clip": 1.17939639, + "balance_loss_mlp": 1.09210205, + "epoch": 0.17748384187584548, + "flos": 20961813594240.0, + "grad_norm": 4.550831734149791, + "language_loss": 0.80114782, + "learning_rate": 3.776132549750806e-06, + "loss": 0.83012372, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.8057408332824707 + }, + { + "auxiliary_loss_clip": 0.01530182, + "auxiliary_loss_mlp": 0.01376827, + "balance_loss_clip": 1.17421031, + "balance_loss_mlp": 1.10750961, + "epoch": 0.17754396512851345, + "flos": 25012353952800.0, + "grad_norm": 2.7762243710823418, + "language_loss": 0.79683733, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82590747, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 4.434277534484863 + }, + { + "auxiliary_loss_clip": 0.01532852, + "auxiliary_loss_mlp": 0.01413052, + "balance_loss_clip": 1.17558229, + "balance_loss_mlp": 1.15060103, + "epoch": 0.1776040883811814, + "flos": 32054318164800.0, + "grad_norm": 4.21272080066335, + "language_loss": 0.88307357, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.91253257, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 4.375609397888184 + }, + { + "auxiliary_loss_clip": 0.0153794, + "auxiliary_loss_mlp": 0.01412769, + "balance_loss_clip": 1.18087721, + "balance_loss_mlp": 1.14574051, + "epoch": 0.17766421163384938, + "flos": 21575952554400.0, + "grad_norm": 2.307863919904669, + "language_loss": 0.85094547, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.88045257, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.8227062225341797 + }, + { + "auxiliary_loss_clip": 0.01533491, + "auxiliary_loss_mlp": 0.01423236, + "balance_loss_clip": 1.17626238, + "balance_loss_mlp": 1.16402745, + "epoch": 0.17772433488651737, + "flos": 22421531332800.0, + "grad_norm": 2.23954110318601, + "language_loss": 0.71410096, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.7436682, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.7997448444366455 + }, + { + "auxiliary_loss_clip": 0.01534688, + "auxiliary_loss_mlp": 0.01387103, + "balance_loss_clip": 1.17793083, + "balance_loss_mlp": 1.12312567, + "epoch": 0.17778445813918534, + "flos": 25631802855360.0, + "grad_norm": 2.252477516787974, + "language_loss": 0.82967764, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85889554, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.8013031482696533 + }, + { + "auxiliary_loss_clip": 0.0153162, + "auxiliary_loss_mlp": 0.01403935, + "balance_loss_clip": 1.17496347, + "balance_loss_mlp": 1.14110219, + "epoch": 0.1778445813918533, + "flos": 25631196004800.0, + "grad_norm": 1.6972757406633694, + "language_loss": 0.74889505, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77825069, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.821805000305176 + }, + { + "auxiliary_loss_clip": 0.01539448, + "auxiliary_loss_mlp": 0.01352433, + "balance_loss_clip": 1.18254113, + "balance_loss_mlp": 1.07968187, + "epoch": 0.17790470464452127, + "flos": 22347684476640.0, + "grad_norm": 2.5570141627938425, + "language_loss": 0.80946743, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.83838618, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.819932222366333 + }, + { + "auxiliary_loss_clip": 0.01537595, + "auxiliary_loss_mlp": 0.01367447, + "balance_loss_clip": 1.1807915, + "balance_loss_mlp": 1.09507787, + "epoch": 0.17796482789718923, + "flos": 18767609750880.0, + "grad_norm": 2.868281140567533, + "language_loss": 0.52103776, + "learning_rate": 3.774698062689362e-06, + "loss": 0.55008817, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 2.7633275985717773 + }, + { + "auxiliary_loss_clip": 0.01528395, + "auxiliary_loss_mlp": 0.01328361, + "balance_loss_clip": 1.1716454, + "balance_loss_mlp": 1.05599177, + "epoch": 0.1780249511498572, + "flos": 23443212379680.0, + "grad_norm": 2.629205145158359, + "language_loss": 0.89742792, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.92599547, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 2.813084363937378 + }, + { + "auxiliary_loss_clip": 0.01524289, + "auxiliary_loss_mlp": 0.01337458, + "balance_loss_clip": 1.16876769, + "balance_loss_mlp": 1.05726898, + "epoch": 0.1780850744025252, + "flos": 23369706876960.0, + "grad_norm": 2.3358714919231662, + "language_loss": 0.78979081, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81840825, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 3.026071071624756 + }, + { + "auxiliary_loss_clip": 0.01527306, + "auxiliary_loss_mlp": 0.01339801, + "balance_loss_clip": 1.17197728, + "balance_loss_mlp": 1.05636859, + "epoch": 0.17814519765519315, + "flos": 13773710724480.0, + "grad_norm": 2.1825941146422005, + "language_loss": 0.75034666, + "learning_rate": 3.774159019458203e-06, + "loss": 0.77901769, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 2.7549357414245605 + }, + { + "auxiliary_loss_clip": 0.01530933, + "auxiliary_loss_mlp": 0.0132475, + "balance_loss_clip": 1.17497027, + "balance_loss_mlp": 1.03807604, + "epoch": 0.17820532090786112, + "flos": 21978298483200.0, + "grad_norm": 1.757772764599874, + "language_loss": 0.79042971, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.81898654, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 2.825363874435425 + }, + { + "auxiliary_loss_clip": 0.01525217, + "auxiliary_loss_mlp": 0.01332853, + "balance_loss_clip": 1.16917515, + "balance_loss_mlp": 1.05151939, + "epoch": 0.17826544416052909, + "flos": 24793089073920.0, + "grad_norm": 1.869246230597895, + "language_loss": 0.81324792, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.8418287, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.8120105266571045 + }, + { + "auxiliary_loss_clip": 0.01530243, + "auxiliary_loss_mlp": 0.01352699, + "balance_loss_clip": 1.17528391, + "balance_loss_mlp": 1.06373549, + "epoch": 0.17832556741319705, + "flos": 13881200222880.0, + "grad_norm": 5.533867001602888, + "language_loss": 0.94556248, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.97439188, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.773430347442627 + }, + { + "auxiliary_loss_clip": 0.01528652, + "auxiliary_loss_mlp": 0.01338037, + "balance_loss_clip": 1.17297339, + "balance_loss_mlp": 1.05136299, + "epoch": 0.17838569066586502, + "flos": 36644316207840.0, + "grad_norm": 3.1621444571106054, + "language_loss": 0.72888452, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.75755143, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 2.897153615951538 + }, + { + "auxiliary_loss_clip": 0.01523308, + "auxiliary_loss_mlp": 0.01342417, + "balance_loss_clip": 1.16929698, + "balance_loss_mlp": 1.05707824, + "epoch": 0.17844581391853298, + "flos": 18728884735200.0, + "grad_norm": 3.2746757123147594, + "language_loss": 0.77319825, + "learning_rate": 3.773259268638157e-06, + "loss": 0.8018555, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 2.7756235599517822 + }, + { + "auxiliary_loss_clip": 0.01521438, + "auxiliary_loss_mlp": 0.01329778, + "balance_loss_clip": 1.16696453, + "balance_loss_mlp": 1.03719032, + "epoch": 0.17850593717120097, + "flos": 27380574015840.0, + "grad_norm": 4.540464164893827, + "language_loss": 0.75571293, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78422511, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 2.8518593311309814 + }, + { + "auxiliary_loss_clip": 0.01592723, + "auxiliary_loss_mlp": 0.01727562, + "balance_loss_clip": 1.23696053, + "balance_loss_mlp": 1.53530121, + "epoch": 0.17856606042386894, + "flos": 67002280293600.0, + "grad_norm": 1.2101748565379704, + "language_loss": 0.68996197, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7231648, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 3.3738808631896973 + }, + { + "auxiliary_loss_clip": 0.01525807, + "auxiliary_loss_mlp": 0.01367876, + "balance_loss_clip": 1.17097628, + "balance_loss_mlp": 1.07910395, + "epoch": 0.1786261836765369, + "flos": 36980021630880.0, + "grad_norm": 3.254955032858972, + "language_loss": 0.67577553, + "learning_rate": 3.772718611185505e-06, + "loss": 0.70471239, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 2.8894755840301514 + }, + { + "auxiliary_loss_clip": 0.01516543, + "auxiliary_loss_mlp": 0.01370142, + "balance_loss_clip": 1.16160882, + "balance_loss_mlp": 1.07164192, + "epoch": 0.17868630692920487, + "flos": 24827679920160.0, + "grad_norm": 2.364956303012872, + "language_loss": 0.90096611, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.92983294, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 2.839916706085205 + }, + { + "auxiliary_loss_clip": 0.01522911, + "auxiliary_loss_mlp": 0.01373059, + "balance_loss_clip": 1.16694343, + "balance_loss_mlp": 1.07742, + "epoch": 0.17874643018187283, + "flos": 16983830534400.0, + "grad_norm": 3.3482986791165152, + "language_loss": 0.88611364, + "learning_rate": 3.77235783676401e-06, + "loss": 0.91507339, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 2.820136785507202 + }, + { + "auxiliary_loss_clip": 0.01528512, + "auxiliary_loss_mlp": 0.01392003, + "balance_loss_clip": 1.17188692, + "balance_loss_mlp": 1.10704541, + "epoch": 0.1788065534345408, + "flos": 21034332964800.0, + "grad_norm": 3.0734094273855606, + "language_loss": 0.76583064, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.79503578, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.849947929382324 + }, + { + "auxiliary_loss_clip": 0.01518365, + "auxiliary_loss_mlp": 0.01375683, + "balance_loss_clip": 1.16256189, + "balance_loss_mlp": 1.07527542, + "epoch": 0.17886667668720876, + "flos": 23989990199040.0, + "grad_norm": 7.397888558772097, + "language_loss": 0.74709153, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.77603197, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.8602712154388428 + }, + { + "auxiliary_loss_clip": 0.01514159, + "auxiliary_loss_mlp": 0.01367361, + "balance_loss_clip": 1.15971863, + "balance_loss_mlp": 1.06428337, + "epoch": 0.17892679993987676, + "flos": 25741871468640.0, + "grad_norm": 2.746349312844683, + "language_loss": 0.73244387, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.76125908, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.8335793018341064 + }, + { + "auxiliary_loss_clip": 0.01522684, + "auxiliary_loss_mlp": 0.01375743, + "balance_loss_clip": 1.16787803, + "balance_loss_mlp": 1.0783875, + "epoch": 0.17898692319254472, + "flos": 25701894823680.0, + "grad_norm": 2.4143196337347375, + "language_loss": 0.77595973, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.80494404, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.7699761390686035 + }, + { + "auxiliary_loss_clip": 0.01524154, + "auxiliary_loss_mlp": 0.01375834, + "balance_loss_clip": 1.16790366, + "balance_loss_mlp": 1.08038568, + "epoch": 0.1790470464452127, + "flos": 19319811297120.0, + "grad_norm": 3.9405546515468606, + "language_loss": 0.79529512, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.82429498, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.8099255561828613 + }, + { + "auxiliary_loss_clip": 0.01514406, + "auxiliary_loss_mlp": 0.01363584, + "balance_loss_clip": 1.1594727, + "balance_loss_mlp": 1.06317639, + "epoch": 0.17910716969788065, + "flos": 30046874402880.0, + "grad_norm": 2.107569149714019, + "language_loss": 0.7681607, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.79694062, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 2.9045844078063965 + }, + { + "auxiliary_loss_clip": 0.01508365, + "auxiliary_loss_mlp": 0.01349112, + "balance_loss_clip": 1.15311837, + "balance_loss_mlp": 1.06243777, + "epoch": 0.17916729295054862, + "flos": 19429766125920.0, + "grad_norm": 3.020057100896926, + "language_loss": 0.69279265, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.72136748, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.751148223876953 + }, + { + "auxiliary_loss_clip": 0.01518122, + "auxiliary_loss_mlp": 0.01350819, + "balance_loss_clip": 1.16229856, + "balance_loss_mlp": 1.06013918, + "epoch": 0.17922741620321658, + "flos": 14613334781760.0, + "grad_norm": 2.5921478013458468, + "language_loss": 0.70930922, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.73799872, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 2.81801176071167 + }, + { + "auxiliary_loss_clip": 0.01524048, + "auxiliary_loss_mlp": 0.01344871, + "balance_loss_clip": 1.16787767, + "balance_loss_mlp": 1.05991292, + "epoch": 0.17928753945588458, + "flos": 17167215009600.0, + "grad_norm": 7.075957746050915, + "language_loss": 0.82469654, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.85338575, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 4.345630168914795 + }, + { + "auxiliary_loss_clip": 0.015108, + "auxiliary_loss_mlp": 0.01339516, + "balance_loss_clip": 1.15450048, + "balance_loss_mlp": 1.06237841, + "epoch": 0.17934766270855254, + "flos": 31398382008000.0, + "grad_norm": 2.409380164833714, + "language_loss": 0.82933176, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85783488, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.8304924964904785 + }, + { + "auxiliary_loss_clip": 0.01513413, + "auxiliary_loss_mlp": 0.01349973, + "balance_loss_clip": 1.15758657, + "balance_loss_mlp": 1.07169104, + "epoch": 0.1794077859612205, + "flos": 20816319715200.0, + "grad_norm": 9.168641330188102, + "language_loss": 0.85657823, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.88521206, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 2.773266553878784 + }, + { + "auxiliary_loss_clip": 0.01513086, + "auxiliary_loss_mlp": 0.0133243, + "balance_loss_clip": 1.15687823, + "balance_loss_mlp": 1.05166817, + "epoch": 0.17946790921388847, + "flos": 28989009527040.0, + "grad_norm": 1.7639938154087178, + "language_loss": 0.89334798, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.92180312, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 2.849030017852783 + }, + { + "auxiliary_loss_clip": 0.01512528, + "auxiliary_loss_mlp": 0.01326109, + "balance_loss_clip": 1.15670729, + "balance_loss_mlp": 1.05679131, + "epoch": 0.17952803246655644, + "flos": 20739324821760.0, + "grad_norm": 2.2293890089396218, + "language_loss": 0.70035416, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72874051, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.81658935546875 + }, + { + "auxiliary_loss_clip": 0.01511629, + "auxiliary_loss_mlp": 0.0135246, + "balance_loss_clip": 1.15549135, + "balance_loss_mlp": 1.08619428, + "epoch": 0.1795881557192244, + "flos": 28258467950880.0, + "grad_norm": 3.4110376562880678, + "language_loss": 0.78084534, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80948627, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 2.812487840652466 + }, + { + "auxiliary_loss_clip": 0.01510151, + "auxiliary_loss_mlp": 0.01367441, + "balance_loss_clip": 1.15333414, + "balance_loss_mlp": 1.0933547, + "epoch": 0.17964827897189237, + "flos": 18554034096000.0, + "grad_norm": 4.814386419730938, + "language_loss": 0.780325, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80910087, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 4.227558612823486 + }, + { + "auxiliary_loss_clip": 0.01534774, + "auxiliary_loss_mlp": 0.01373215, + "balance_loss_clip": 1.17843187, + "balance_loss_mlp": 1.14204407, + "epoch": 0.17970840222456036, + "flos": 58170771784800.0, + "grad_norm": 0.7876487639692634, + "language_loss": 0.62678361, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.65586352, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.209601402282715 + }, + { + "auxiliary_loss_clip": 0.01518597, + "auxiliary_loss_mlp": 0.01376977, + "balance_loss_clip": 1.16206098, + "balance_loss_mlp": 1.11166501, + "epoch": 0.17976852547722832, + "flos": 20302767328320.0, + "grad_norm": 6.366838484565396, + "language_loss": 0.70341676, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.73237252, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.815239667892456 + }, + { + "auxiliary_loss_clip": 0.01510242, + "auxiliary_loss_mlp": 0.01391441, + "balance_loss_clip": 1.1544776, + "balance_loss_mlp": 1.12441254, + "epoch": 0.1798286487298963, + "flos": 39672113531040.0, + "grad_norm": 2.978865739838583, + "language_loss": 0.69186187, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.72087872, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 2.924851894378662 + }, + { + "auxiliary_loss_clip": 0.01510781, + "auxiliary_loss_mlp": 0.01414487, + "balance_loss_clip": 1.15427971, + "balance_loss_mlp": 1.14936602, + "epoch": 0.17988877198256426, + "flos": 25522682446080.0, + "grad_norm": 15.00950528972259, + "language_loss": 0.82717854, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.85643125, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 5.8916497230529785 + }, + { + "auxiliary_loss_clip": 0.01504279, + "auxiliary_loss_mlp": 0.01419131, + "balance_loss_clip": 1.14967811, + "balance_loss_mlp": 1.15210235, + "epoch": 0.17994889523523222, + "flos": 18809520732000.0, + "grad_norm": 8.347383078352676, + "language_loss": 0.82308304, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.85231715, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 2.787748336791992 + }, + { + "auxiliary_loss_clip": 0.01506681, + "auxiliary_loss_mlp": 0.01393671, + "balance_loss_clip": 1.15088308, + "balance_loss_mlp": 1.13217318, + "epoch": 0.18000901848790019, + "flos": 21106510981920.0, + "grad_norm": 3.645692216989879, + "language_loss": 0.78977644, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.81877989, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.8099589347839355 + }, + { + "auxiliary_loss_clip": 0.01512019, + "auxiliary_loss_mlp": 0.01447161, + "balance_loss_clip": 1.15572143, + "balance_loss_mlp": 1.19138515, + "epoch": 0.18006914174056818, + "flos": 19648500010560.0, + "grad_norm": 2.9943738472933914, + "language_loss": 0.80428493, + "learning_rate": 3.768371587287296e-06, + "loss": 0.83387673, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.8131301403045654 + }, + { + "auxiliary_loss_clip": 0.01514991, + "auxiliary_loss_mlp": 0.01394134, + "balance_loss_clip": 1.15961695, + "balance_loss_mlp": 1.12271845, + "epoch": 0.18012926499323614, + "flos": 19501906214880.0, + "grad_norm": 1.8546866637379984, + "language_loss": 0.84527397, + "learning_rate": 3.768189622421512e-06, + "loss": 0.87436527, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.8126535415649414 + }, + { + "auxiliary_loss_clip": 0.01504642, + "auxiliary_loss_mlp": 0.01376057, + "balance_loss_clip": 1.14949512, + "balance_loss_mlp": 1.1059761, + "epoch": 0.1801893882459041, + "flos": 19466632661760.0, + "grad_norm": 1.6082536859809382, + "language_loss": 0.88058925, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90939617, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.815946340560913 + }, + { + "auxiliary_loss_clip": 0.01504325, + "auxiliary_loss_mlp": 0.01388859, + "balance_loss_clip": 1.14824629, + "balance_loss_mlp": 1.12087631, + "epoch": 0.18024951149857207, + "flos": 26873052206400.0, + "grad_norm": 2.2724558775388197, + "language_loss": 0.85239601, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.88132787, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.8535854816436768 + }, + { + "auxiliary_loss_clip": 0.01515036, + "auxiliary_loss_mlp": 0.01378267, + "balance_loss_clip": 1.15962243, + "balance_loss_mlp": 1.11371768, + "epoch": 0.18030963475124004, + "flos": 30229234817760.0, + "grad_norm": 2.5295537182946046, + "language_loss": 0.84228677, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.87121975, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.8804874420166016 + }, + { + "auxiliary_loss_clip": 0.01506062, + "auxiliary_loss_mlp": 0.01341811, + "balance_loss_clip": 1.15056372, + "balance_loss_mlp": 1.06467319, + "epoch": 0.180369758003908, + "flos": 22309566311520.0, + "grad_norm": 2.8817235840029394, + "language_loss": 0.75152075, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77999949, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 2.8152546882629395 + }, + { + "auxiliary_loss_clip": 0.01505409, + "auxiliary_loss_mlp": 0.0133351, + "balance_loss_clip": 1.14849293, + "balance_loss_mlp": 1.05713534, + "epoch": 0.18042988125657597, + "flos": 23734275994080.0, + "grad_norm": 10.531696659270652, + "language_loss": 0.71274298, + "learning_rate": 3.76727879248177e-06, + "loss": 0.7411322, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 2.7900214195251465 + }, + { + "auxiliary_loss_clip": 0.01503572, + "auxiliary_loss_mlp": 0.01330576, + "balance_loss_clip": 1.1485033, + "balance_loss_mlp": 1.05229378, + "epoch": 0.18049000450924396, + "flos": 24095659145760.0, + "grad_norm": 3.7036480294053953, + "language_loss": 0.88552707, + "learning_rate": 3.767096425420011e-06, + "loss": 0.91386855, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.841750144958496 + }, + { + "auxiliary_loss_clip": 0.01502941, + "auxiliary_loss_mlp": 0.01323148, + "balance_loss_clip": 1.14818668, + "balance_loss_mlp": 1.04124141, + "epoch": 0.18055012776191193, + "flos": 22165324061760.0, + "grad_norm": 4.412649756661646, + "language_loss": 0.81065744, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.83891833, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.7812163829803467 + }, + { + "auxiliary_loss_clip": 0.01507189, + "auxiliary_loss_mlp": 0.01324987, + "balance_loss_clip": 1.15119839, + "balance_loss_mlp": 1.04289019, + "epoch": 0.1806102510145799, + "flos": 28916072946720.0, + "grad_norm": 4.475499727085918, + "language_loss": 0.67871797, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.70703971, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.883841037750244 + }, + { + "auxiliary_loss_clip": 0.01509575, + "auxiliary_loss_mlp": 0.01350745, + "balance_loss_clip": 1.15462613, + "balance_loss_mlp": 1.0732255, + "epoch": 0.18067037426724786, + "flos": 19027685694240.0, + "grad_norm": 3.324733753406748, + "language_loss": 0.8518827, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.88048583, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 2.7536818981170654 + }, + { + "auxiliary_loss_clip": 0.01509068, + "auxiliary_loss_mlp": 0.0134553, + "balance_loss_clip": 1.15293586, + "balance_loss_mlp": 1.06534004, + "epoch": 0.18073049751991582, + "flos": 27456089711040.0, + "grad_norm": 2.290933571290884, + "language_loss": 0.83174753, + "learning_rate": 3.766366287157432e-06, + "loss": 0.86029351, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 2.8184876441955566 + }, + { + "auxiliary_loss_clip": 0.01506096, + "auxiliary_loss_mlp": 0.01351723, + "balance_loss_clip": 1.15089869, + "balance_loss_mlp": 1.06600237, + "epoch": 0.1807906207725838, + "flos": 28731474770400.0, + "grad_norm": 2.0398425756026812, + "language_loss": 0.77446723, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.80304551, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 2.847867488861084 + }, + { + "auxiliary_loss_clip": 0.01554783, + "auxiliary_loss_mlp": 0.0137973, + "balance_loss_clip": 1.20309663, + "balance_loss_mlp": 1.15008545, + "epoch": 0.18085074402525175, + "flos": 64474001573760.0, + "grad_norm": 1.0736858388693462, + "language_loss": 0.56922507, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59857011, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 3.4514827728271484 + }, + { + "auxiliary_loss_clip": 0.01508749, + "auxiliary_loss_mlp": 0.01337107, + "balance_loss_clip": 1.1536572, + "balance_loss_mlp": 1.05310249, + "epoch": 0.18091086727791975, + "flos": 23479206567840.0, + "grad_norm": 2.2351581966472507, + "language_loss": 0.67480552, + "learning_rate": 3.765817980138021e-06, + "loss": 0.70326412, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 2.9825265407562256 + }, + { + "auxiliary_loss_clip": 0.01502501, + "auxiliary_loss_mlp": 0.01335005, + "balance_loss_clip": 1.14761925, + "balance_loss_mlp": 1.05138254, + "epoch": 0.1809709905305877, + "flos": 24172767823680.0, + "grad_norm": 3.7451956844631433, + "language_loss": 0.75737298, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.785748, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 2.8062283992767334 + }, + { + "auxiliary_loss_clip": 0.01498208, + "auxiliary_loss_mlp": 0.01310747, + "balance_loss_clip": 1.14352906, + "balance_loss_mlp": 1.03418159, + "epoch": 0.18103111378325568, + "flos": 21652947447840.0, + "grad_norm": 1.8257870976971748, + "language_loss": 0.67343509, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.70152462, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 2.83512806892395 + }, + { + "auxiliary_loss_clip": 0.01506468, + "auxiliary_loss_mlp": 0.01331251, + "balance_loss_clip": 1.15219164, + "balance_loss_mlp": 1.06193292, + "epoch": 0.18109123703592364, + "flos": 53690842719360.0, + "grad_norm": 2.068642147423667, + "language_loss": 0.71458977, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.74296695, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 3.101539134979248 + }, + { + "auxiliary_loss_clip": 0.01509791, + "auxiliary_loss_mlp": 0.01342533, + "balance_loss_clip": 1.15477324, + "balance_loss_mlp": 1.07016373, + "epoch": 0.1811513602885916, + "flos": 35848499539680.0, + "grad_norm": 27.721275361489976, + "language_loss": 0.62408233, + "learning_rate": 3.765085966704609e-06, + "loss": 0.65260559, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 3.0093986988067627 + }, + { + "auxiliary_loss_clip": 0.01509212, + "auxiliary_loss_mlp": 0.01346277, + "balance_loss_clip": 1.15438104, + "balance_loss_mlp": 1.07505155, + "epoch": 0.18121148354125957, + "flos": 23734844916480.0, + "grad_norm": 2.187151849061087, + "language_loss": 0.76361388, + "learning_rate": 3.764902795998309e-06, + "loss": 0.79216874, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 2.789174795150757 + }, + { + "auxiliary_loss_clip": 0.015119, + "auxiliary_loss_mlp": 0.01367181, + "balance_loss_clip": 1.15783775, + "balance_loss_mlp": 1.09290457, + "epoch": 0.18127160679392756, + "flos": 28730716207200.0, + "grad_norm": 2.1975632743309435, + "language_loss": 0.66118026, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68997109, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.8931849002838135 + }, + { + "auxiliary_loss_clip": 0.01497795, + "auxiliary_loss_mlp": 0.01349501, + "balance_loss_clip": 1.14421868, + "balance_loss_mlp": 1.07522476, + "epoch": 0.18133173004659553, + "flos": 20487137935680.0, + "grad_norm": 3.0275451422976762, + "language_loss": 0.78422713, + "learning_rate": 3.764536253816785e-06, + "loss": 0.81270009, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.840879440307617 + }, + { + "auxiliary_loss_clip": 0.01502864, + "auxiliary_loss_mlp": 0.01348601, + "balance_loss_clip": 1.15021443, + "balance_loss_mlp": 1.0710814, + "epoch": 0.1813918532992635, + "flos": 22854030513120.0, + "grad_norm": 4.596910980049935, + "language_loss": 0.83649039, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.86500502, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.8260014057159424 + }, + { + "auxiliary_loss_clip": 0.01504, + "auxiliary_loss_mlp": 0.01370107, + "balance_loss_clip": 1.15096545, + "balance_loss_mlp": 1.10174346, + "epoch": 0.18145197655193146, + "flos": 36068105772000.0, + "grad_norm": 5.380419312506158, + "language_loss": 0.6731441, + "learning_rate": 3.764169443989697e-06, + "loss": 0.70188522, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 2.8954617977142334 + }, + { + "auxiliary_loss_clip": 0.01502141, + "auxiliary_loss_mlp": 0.01363635, + "balance_loss_clip": 1.14958751, + "balance_loss_mlp": 1.08783221, + "epoch": 0.18151209980459942, + "flos": 24026211956160.0, + "grad_norm": 2.6001360173440546, + "language_loss": 0.75854433, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.78720212, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.797581911087036 + }, + { + "auxiliary_loss_clip": 0.01517029, + "auxiliary_loss_mlp": 0.01365582, + "balance_loss_clip": 1.16312003, + "balance_loss_mlp": 1.09130573, + "epoch": 0.1815722230572674, + "flos": 23953920154560.0, + "grad_norm": 7.1703588646671745, + "language_loss": 0.81976837, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.84859449, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 4.328613996505737 + }, + { + "auxiliary_loss_clip": 0.01508121, + "auxiliary_loss_mlp": 0.01352329, + "balance_loss_clip": 1.15571153, + "balance_loss_mlp": 1.0751915, + "epoch": 0.18163234630993536, + "flos": 24388619168160.0, + "grad_norm": 5.829741080097862, + "language_loss": 0.77605784, + "learning_rate": 3.763618727535352e-06, + "loss": 0.80466241, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 2.7819032669067383 + }, + { + "auxiliary_loss_clip": 0.01501569, + "auxiliary_loss_mlp": 0.01346159, + "balance_loss_clip": 1.14887762, + "balance_loss_mlp": 1.06825829, + "epoch": 0.18169246956260335, + "flos": 24683665239360.0, + "grad_norm": 2.152472918276546, + "language_loss": 0.8495636, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87804091, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.8788466453552246 + }, + { + "auxiliary_loss_clip": 0.01502035, + "auxiliary_loss_mlp": 0.01345502, + "balance_loss_clip": 1.14891601, + "balance_loss_mlp": 1.07027209, + "epoch": 0.1817525928152713, + "flos": 24245590619520.0, + "grad_norm": 10.600382061873663, + "language_loss": 0.69264287, + "learning_rate": 3.763251248837859e-06, + "loss": 0.72111821, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.8211348056793213 + }, + { + "auxiliary_loss_clip": 0.01500745, + "auxiliary_loss_mlp": 0.01331745, + "balance_loss_clip": 1.14606106, + "balance_loss_mlp": 1.05479813, + "epoch": 0.18181271606793928, + "flos": 16473957179040.0, + "grad_norm": 1.9040410415246225, + "language_loss": 0.74282956, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.77115452, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 2.8130903244018555 + }, + { + "auxiliary_loss_clip": 0.01504283, + "auxiliary_loss_mlp": 0.01340605, + "balance_loss_clip": 1.15190244, + "balance_loss_mlp": 1.06480241, + "epoch": 0.18187283932060724, + "flos": 18582632292960.0, + "grad_norm": 2.9548008491452897, + "language_loss": 0.88359046, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.91203922, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 2.7826592922210693 + }, + { + "auxiliary_loss_clip": 0.01504399, + "auxiliary_loss_mlp": 0.01340109, + "balance_loss_clip": 1.15126705, + "balance_loss_mlp": 1.06468809, + "epoch": 0.1819329625732752, + "flos": 20268935045280.0, + "grad_norm": 3.512420451082122, + "language_loss": 0.78919131, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81763637, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 4.282029628753662 + }, + { + "auxiliary_loss_clip": 0.01506311, + "auxiliary_loss_mlp": 0.01330799, + "balance_loss_clip": 1.15317976, + "balance_loss_mlp": 1.0548054, + "epoch": 0.18199308582594317, + "flos": 25917215173920.0, + "grad_norm": 6.5339449527699935, + "language_loss": 0.76198637, + "learning_rate": 3.762515489146692e-06, + "loss": 0.79035747, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.9120335578918457 + }, + { + "auxiliary_loss_clip": 0.01504151, + "auxiliary_loss_mlp": 0.01355992, + "balance_loss_clip": 1.15062952, + "balance_loss_mlp": 1.07809138, + "epoch": 0.18205320907861114, + "flos": 15379339551840.0, + "grad_norm": 5.258422828562379, + "language_loss": 0.8534041, + "learning_rate": 3.762331382119546e-06, + "loss": 0.88200557, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.7932844161987305 + }, + { + "auxiliary_loss_clip": 0.01507849, + "auxiliary_loss_mlp": 0.01367517, + "balance_loss_clip": 1.1551342, + "balance_loss_mlp": 1.09667325, + "epoch": 0.18211333233127913, + "flos": 25626379128480.0, + "grad_norm": 1.9765250684314541, + "language_loss": 0.82968664, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.85844028, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 4.321113109588623 + }, + { + "auxiliary_loss_clip": 0.0151377, + "auxiliary_loss_mlp": 0.0133266, + "balance_loss_clip": 1.16068625, + "balance_loss_mlp": 1.04922831, + "epoch": 0.1821734555839471, + "flos": 14977790114400.0, + "grad_norm": 2.6444380340930884, + "language_loss": 0.77964872, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80811298, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.75650954246521 + }, + { + "auxiliary_loss_clip": 0.01500198, + "auxiliary_loss_mlp": 0.01337432, + "balance_loss_clip": 1.14765143, + "balance_loss_mlp": 1.05514455, + "epoch": 0.18223357883661506, + "flos": 20196112249440.0, + "grad_norm": 3.5098824924730336, + "language_loss": 0.84814519, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87652147, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 4.232495546340942 + }, + { + "auxiliary_loss_clip": 0.01502993, + "auxiliary_loss_mlp": 0.01331095, + "balance_loss_clip": 1.15024614, + "balance_loss_mlp": 1.05014277, + "epoch": 0.18229370208928303, + "flos": 15233807744640.0, + "grad_norm": 3.7184868340810833, + "language_loss": 0.80333209, + "learning_rate": 3.76159428580299e-06, + "loss": 0.83167297, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.7930712699890137 + }, + { + "auxiliary_loss_clip": 0.01505435, + "auxiliary_loss_mlp": 0.01336808, + "balance_loss_clip": 1.15051103, + "balance_loss_mlp": 1.05165911, + "epoch": 0.182353825341951, + "flos": 23842675768320.0, + "grad_norm": 2.4379495739981603, + "language_loss": 0.81121373, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83963609, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.869410276412964 + }, + { + "auxiliary_loss_clip": 0.01674638, + "auxiliary_loss_mlp": 0.01640861, + "balance_loss_clip": 1.3306936, + "balance_loss_mlp": 1.42266083, + "epoch": 0.18241394859461896, + "flos": 61196672700000.0, + "grad_norm": 1.035106505287732, + "language_loss": 0.63429856, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.66745353, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 3.2572083473205566 + }, + { + "auxiliary_loss_clip": 0.01504908, + "auxiliary_loss_mlp": 0.0133905, + "balance_loss_clip": 1.15109277, + "balance_loss_mlp": 1.05542755, + "epoch": 0.18247407184728695, + "flos": 18473284314720.0, + "grad_norm": 2.453719538268217, + "language_loss": 0.80219167, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.83063126, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.7857518196105957 + }, + { + "auxiliary_loss_clip": 0.01501068, + "auxiliary_loss_mlp": 0.0133492, + "balance_loss_clip": 1.14687741, + "balance_loss_mlp": 1.04881787, + "epoch": 0.18253419509995492, + "flos": 21797151769440.0, + "grad_norm": 1.898297182815477, + "language_loss": 0.84839791, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87675774, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.796999216079712 + }, + { + "auxiliary_loss_clip": 0.01504163, + "auxiliary_loss_mlp": 0.01344373, + "balance_loss_clip": 1.14974737, + "balance_loss_mlp": 1.05464625, + "epoch": 0.18259431835262288, + "flos": 20151432512640.0, + "grad_norm": 3.2661900516007725, + "language_loss": 0.80085325, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82933861, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.8479018211364746 + }, + { + "auxiliary_loss_clip": 0.01507958, + "auxiliary_loss_mlp": 0.01364091, + "balance_loss_clip": 1.1531024, + "balance_loss_mlp": 1.07074118, + "epoch": 0.18265444160529085, + "flos": 16983489180960.0, + "grad_norm": 15.518133365920043, + "language_loss": 0.79787314, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.82659364, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 2.8053126335144043 + }, + { + "auxiliary_loss_clip": 0.0150193, + "auxiliary_loss_mlp": 0.01359951, + "balance_loss_clip": 1.1477685, + "balance_loss_mlp": 1.06488371, + "epoch": 0.1827145648579588, + "flos": 34426141403040.0, + "grad_norm": 2.0104400682877, + "language_loss": 0.67244965, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.7010684, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 2.864518642425537 + }, + { + "auxiliary_loss_clip": 0.01502014, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_clip": 1.14757073, + "balance_loss_mlp": 1.06640351, + "epoch": 0.18277468811062678, + "flos": 53291000049120.0, + "grad_norm": 1.9055817535516475, + "language_loss": 0.73856568, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.76718342, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 3.0824975967407227 + }, + { + "auxiliary_loss_clip": 0.01500825, + "auxiliary_loss_mlp": 0.01353844, + "balance_loss_clip": 1.14651752, + "balance_loss_mlp": 1.05305529, + "epoch": 0.18283481136329474, + "flos": 31653413506080.0, + "grad_norm": 2.3210519159858083, + "language_loss": 0.6063332, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.63487989, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.8391873836517334 + }, + { + "auxiliary_loss_clip": 0.01498347, + "auxiliary_loss_mlp": 0.01347901, + "balance_loss_clip": 1.14450192, + "balance_loss_mlp": 1.05168986, + "epoch": 0.18289493461596273, + "flos": 53141447856960.0, + "grad_norm": 1.8221256048748615, + "language_loss": 0.6051721, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.63363463, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 3.046205759048462 + }, + { + "auxiliary_loss_clip": 0.01504124, + "auxiliary_loss_mlp": 0.01355482, + "balance_loss_clip": 1.14996696, + "balance_loss_mlp": 1.06728137, + "epoch": 0.1829550578686307, + "flos": 25591522785120.0, + "grad_norm": 2.103734310886282, + "language_loss": 0.87631428, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.90491033, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 2.8248612880706787 + }, + { + "auxiliary_loss_clip": 0.01498761, + "auxiliary_loss_mlp": 0.01332496, + "balance_loss_clip": 1.14374352, + "balance_loss_mlp": 1.04677474, + "epoch": 0.18301518112129866, + "flos": 22603626250560.0, + "grad_norm": 2.177832320854251, + "language_loss": 0.70226747, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73058003, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 2.8062753677368164 + }, + { + "auxiliary_loss_clip": 0.01496899, + "auxiliary_loss_mlp": 0.01332919, + "balance_loss_clip": 1.14442611, + "balance_loss_mlp": 1.03499079, + "epoch": 0.18307530437396663, + "flos": 34023605833440.0, + "grad_norm": 6.55920048601006, + "language_loss": 0.63980502, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66810316, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 2.8655505180358887 + }, + { + "auxiliary_loss_clip": 0.0149772, + "auxiliary_loss_mlp": 0.01322917, + "balance_loss_clip": 1.14291644, + "balance_loss_mlp": 1.03662372, + "epoch": 0.1831354276266346, + "flos": 21281437477440.0, + "grad_norm": 4.559151256870633, + "language_loss": 0.79719341, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.82539982, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 2.761220693588257 + }, + { + "auxiliary_loss_clip": 0.01495011, + "auxiliary_loss_mlp": 0.01330473, + "balance_loss_clip": 1.13978827, + "balance_loss_mlp": 1.04170036, + "epoch": 0.18319555087930256, + "flos": 21035091528000.0, + "grad_norm": 2.322642881360922, + "language_loss": 0.78841984, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81667471, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 2.8484995365142822 + }, + { + "auxiliary_loss_clip": 0.01506025, + "auxiliary_loss_mlp": 0.01344333, + "balance_loss_clip": 1.15127087, + "balance_loss_mlp": 1.06490636, + "epoch": 0.18325567413197055, + "flos": 34385633763840.0, + "grad_norm": 1.7094267817161175, + "language_loss": 0.80658007, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83508372, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 2.9082679748535156 + }, + { + "auxiliary_loss_clip": 0.01500584, + "auxiliary_loss_mlp": 0.01326391, + "balance_loss_clip": 1.14524162, + "balance_loss_mlp": 1.0507791, + "epoch": 0.18331579738463852, + "flos": 20560567582080.0, + "grad_norm": 2.060052206919062, + "language_loss": 0.8667419, + "learning_rate": 3.758449708105424e-06, + "loss": 0.89501166, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.815793037414551 + }, + { + "auxiliary_loss_clip": 0.01495847, + "auxiliary_loss_mlp": 0.01341157, + "balance_loss_clip": 1.14125133, + "balance_loss_mlp": 1.0636375, + "epoch": 0.18337592063730648, + "flos": 19609812923040.0, + "grad_norm": 2.651986017595356, + "language_loss": 0.77529204, + "learning_rate": 3.75826413248424e-06, + "loss": 0.80366206, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.7499282360076904 + }, + { + "auxiliary_loss_clip": 0.0149534, + "auxiliary_loss_mlp": 0.0132528, + "balance_loss_clip": 1.14054334, + "balance_loss_mlp": 1.05062222, + "epoch": 0.18343604388997445, + "flos": 20853034538400.0, + "grad_norm": 3.417767521695052, + "language_loss": 0.99408287, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.02228904, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 2.7649917602539062 + }, + { + "auxiliary_loss_clip": 0.0149914, + "auxiliary_loss_mlp": 0.01330375, + "balance_loss_clip": 1.14454412, + "balance_loss_mlp": 1.0576241, + "epoch": 0.1834961671426424, + "flos": 24396963363360.0, + "grad_norm": 2.035319483891819, + "language_loss": 0.86399281, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.89228791, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.90446400642395 + }, + { + "auxiliary_loss_clip": 0.01499724, + "auxiliary_loss_mlp": 0.01362903, + "balance_loss_clip": 1.14530492, + "balance_loss_mlp": 1.09205973, + "epoch": 0.18355629039531038, + "flos": 21253749556320.0, + "grad_norm": 1.9269079589663554, + "language_loss": 0.73256373, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.76119006, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.812422037124634 + }, + { + "auxiliary_loss_clip": 0.01504958, + "auxiliary_loss_mlp": 0.01359283, + "balance_loss_clip": 1.14984345, + "balance_loss_mlp": 1.08538759, + "epoch": 0.18361641364797834, + "flos": 28658955399840.0, + "grad_norm": 2.1795598808634193, + "language_loss": 0.62028062, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64892304, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.8411669731140137 + }, + { + "auxiliary_loss_clip": 0.01496805, + "auxiliary_loss_mlp": 0.01351865, + "balance_loss_clip": 1.14315939, + "balance_loss_mlp": 1.08159411, + "epoch": 0.18367653690064634, + "flos": 20920699104480.0, + "grad_norm": 2.2691256223998306, + "language_loss": 0.78056979, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80905652, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.758007049560547 + }, + { + "auxiliary_loss_clip": 0.01497101, + "auxiliary_loss_mlp": 0.01354463, + "balance_loss_clip": 1.14114225, + "balance_loss_mlp": 1.08743405, + "epoch": 0.1837366601533143, + "flos": 28768037880960.0, + "grad_norm": 2.1487657829156466, + "language_loss": 0.6983881, + "learning_rate": 3.757149278859014e-06, + "loss": 0.7269038, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.8327465057373047 + }, + { + "auxiliary_loss_clip": 0.01494405, + "auxiliary_loss_mlp": 0.01353167, + "balance_loss_clip": 1.13830471, + "balance_loss_mlp": 1.08651924, + "epoch": 0.18379678340598227, + "flos": 21253559915520.0, + "grad_norm": 1.8631828807842676, + "language_loss": 0.80660844, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.83508414, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 2.8011481761932373 + }, + { + "auxiliary_loss_clip": 0.01489539, + "auxiliary_loss_mlp": 0.01363241, + "balance_loss_clip": 1.13418365, + "balance_loss_mlp": 1.08705711, + "epoch": 0.18385690665865023, + "flos": 20451750598080.0, + "grad_norm": 2.6417323671908717, + "language_loss": 0.8253327, + "learning_rate": 3.756777127858533e-06, + "loss": 0.85386044, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 2.7938339710235596 + }, + { + "auxiliary_loss_clip": 0.01496766, + "auxiliary_loss_mlp": 0.01363935, + "balance_loss_clip": 1.14097857, + "balance_loss_mlp": 1.09633422, + "epoch": 0.1839170299113182, + "flos": 26142965768160.0, + "grad_norm": 4.295339610735951, + "language_loss": 0.85541511, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88402212, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 4.41278862953186 + }, + { + "auxiliary_loss_clip": 0.01494963, + "auxiliary_loss_mlp": 0.01356673, + "balance_loss_clip": 1.13760579, + "balance_loss_mlp": 1.09155154, + "epoch": 0.18397715316398616, + "flos": 31760675435520.0, + "grad_norm": 2.239604585690103, + "language_loss": 0.72834682, + "learning_rate": 3.756404710389396e-06, + "loss": 0.75686312, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 2.8703956604003906 + }, + { + "auxiliary_loss_clip": 0.01497719, + "auxiliary_loss_mlp": 0.01350452, + "balance_loss_clip": 1.14135909, + "balance_loss_mlp": 1.07998967, + "epoch": 0.18403727641665413, + "flos": 24614521475040.0, + "grad_norm": 2.0340849343887073, + "language_loss": 0.72974575, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75822753, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 2.8219151496887207 + }, + { + "auxiliary_loss_clip": 0.01498291, + "auxiliary_loss_mlp": 0.01349406, + "balance_loss_clip": 1.14341855, + "balance_loss_mlp": 1.07913446, + "epoch": 0.18409739966932212, + "flos": 23442112463040.0, + "grad_norm": 2.1354026845056433, + "language_loss": 0.81624913, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.84472609, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.8392221927642822 + }, + { + "auxiliary_loss_clip": 0.015018, + "auxiliary_loss_mlp": 0.01345764, + "balance_loss_clip": 1.14613223, + "balance_loss_mlp": 1.0711056, + "epoch": 0.18415752292199009, + "flos": 21874298375520.0, + "grad_norm": 2.147914698623575, + "language_loss": 0.73004377, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.75851935, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 2.7641384601593018 + }, + { + "auxiliary_loss_clip": 0.0149967, + "auxiliary_loss_mlp": 0.0133155, + "balance_loss_clip": 1.14377093, + "balance_loss_mlp": 1.05879903, + "epoch": 0.18421764617465805, + "flos": 25413031042560.0, + "grad_norm": 2.1112038646519466, + "language_loss": 0.65998173, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.68829393, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 4.301867485046387 + }, + { + "auxiliary_loss_clip": 0.01501199, + "auxiliary_loss_mlp": 0.01330568, + "balance_loss_clip": 1.14534652, + "balance_loss_mlp": 1.0459919, + "epoch": 0.18427776942732602, + "flos": 27200565146880.0, + "grad_norm": 2.5153296336903774, + "language_loss": 0.68703377, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.71535146, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 2.7950127124786377 + }, + { + "auxiliary_loss_clip": 0.0150605, + "auxiliary_loss_mlp": 0.0134361, + "balance_loss_clip": 1.1494379, + "balance_loss_mlp": 1.06265795, + "epoch": 0.18433789267999398, + "flos": 27854680752000.0, + "grad_norm": 3.8268936168468586, + "language_loss": 0.72952569, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75802231, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.8690216541290283 + }, + { + "auxiliary_loss_clip": 0.01497085, + "auxiliary_loss_mlp": 0.01332462, + "balance_loss_clip": 1.14101887, + "balance_loss_mlp": 1.0507462, + "epoch": 0.18439801593266195, + "flos": 17858804001120.0, + "grad_norm": 2.532695848010257, + "language_loss": 0.82165849, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84995389, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 4.324210166931152 + }, + { + "auxiliary_loss_clip": 0.01668697, + "auxiliary_loss_mlp": 0.01650536, + "balance_loss_clip": 1.3122139, + "balance_loss_mlp": 1.37892914, + "epoch": 0.18445813918532994, + "flos": 56395791901440.0, + "grad_norm": 0.874979819449545, + "language_loss": 0.59620059, + "learning_rate": 3.754912376956657e-06, + "loss": 0.62939286, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 3.249361991882324 + }, + { + "auxiliary_loss_clip": 0.01503459, + "auxiliary_loss_mlp": 0.01327025, + "balance_loss_clip": 1.14689612, + "balance_loss_mlp": 1.05122232, + "epoch": 0.1845182624379979, + "flos": 20959158623040.0, + "grad_norm": 1.8615262183525962, + "language_loss": 0.76877451, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.79707932, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.8095438480377197 + }, + { + "auxiliary_loss_clip": 0.01500055, + "auxiliary_loss_mlp": 0.01351336, + "balance_loss_clip": 1.14339995, + "balance_loss_mlp": 1.07744122, + "epoch": 0.18457838569066587, + "flos": 20487251720160.0, + "grad_norm": 2.602049248583971, + "language_loss": 0.84976041, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87827438, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 4.208964824676514 + }, + { + "auxiliary_loss_clip": 0.01505134, + "auxiliary_loss_mlp": 0.01360509, + "balance_loss_clip": 1.15006185, + "balance_loss_mlp": 1.08298993, + "epoch": 0.18463850894333383, + "flos": 25012391880960.0, + "grad_norm": 2.4182709137586724, + "language_loss": 0.77908778, + "learning_rate": 3.754351653708265e-06, + "loss": 0.8077442, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 2.838231086730957 + }, + { + "auxiliary_loss_clip": 0.01506721, + "auxiliary_loss_mlp": 0.01372709, + "balance_loss_clip": 1.15336561, + "balance_loss_mlp": 1.09404492, + "epoch": 0.1846986321960018, + "flos": 16802532108000.0, + "grad_norm": 2.933501037412007, + "language_loss": 0.77897823, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.80777258, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.7469217777252197 + }, + { + "auxiliary_loss_clip": 0.01501579, + "auxiliary_loss_mlp": 0.01334935, + "balance_loss_clip": 1.14851665, + "balance_loss_mlp": 1.056844, + "epoch": 0.18475875544866976, + "flos": 20816471427840.0, + "grad_norm": 2.7258652597405715, + "language_loss": 0.86308038, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.89144564, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.7787282466888428 + }, + { + "auxiliary_loss_clip": 0.01504162, + "auxiliary_loss_mlp": 0.01344877, + "balance_loss_clip": 1.15009761, + "balance_loss_mlp": 1.07288885, + "epoch": 0.18481887870133773, + "flos": 22603626250560.0, + "grad_norm": 2.7018146873134445, + "language_loss": 0.92059237, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.94908273, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 2.719729423522949 + }, + { + "auxiliary_loss_clip": 0.01506136, + "auxiliary_loss_mlp": 0.01332459, + "balance_loss_clip": 1.15281367, + "balance_loss_mlp": 1.05551231, + "epoch": 0.18487900195400572, + "flos": 29461295711520.0, + "grad_norm": 2.5265530895309416, + "language_loss": 0.64821875, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67660463, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.7652883529663086 + }, + { + "auxiliary_loss_clip": 0.01512375, + "auxiliary_loss_mlp": 0.01357019, + "balance_loss_clip": 1.15910399, + "balance_loss_mlp": 1.08541262, + "epoch": 0.1849391252066737, + "flos": 20630735406720.0, + "grad_norm": 2.060433198325368, + "language_loss": 0.72697914, + "learning_rate": 3.753415784551761e-06, + "loss": 0.75567311, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 2.636173725128174 + }, + { + "auxiliary_loss_clip": 0.01507493, + "auxiliary_loss_mlp": 0.01394135, + "balance_loss_clip": 1.15498328, + "balance_loss_mlp": 1.12176514, + "epoch": 0.18499924845934165, + "flos": 14430291660000.0, + "grad_norm": 3.2343488203417463, + "language_loss": 0.80890739, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83792365, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.6230320930480957 + }, + { + "auxiliary_loss_clip": 0.01511231, + "auxiliary_loss_mlp": 0.01375116, + "balance_loss_clip": 1.15695477, + "balance_loss_mlp": 1.10808754, + "epoch": 0.18505937171200962, + "flos": 23729648758560.0, + "grad_norm": 2.541251049111509, + "language_loss": 0.78891128, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81777477, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.7685585021972656 + }, + { + "auxiliary_loss_clip": 0.01508841, + "auxiliary_loss_mlp": 0.01364699, + "balance_loss_clip": 1.15670788, + "balance_loss_mlp": 1.09404635, + "epoch": 0.18511949496467758, + "flos": 25959846790080.0, + "grad_norm": 2.2119619701031437, + "language_loss": 0.77835929, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.80709469, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 2.8213884830474854 + }, + { + "auxiliary_loss_clip": 0.01501164, + "auxiliary_loss_mlp": 0.01344241, + "balance_loss_clip": 1.14840007, + "balance_loss_mlp": 1.07454181, + "epoch": 0.18517961821734555, + "flos": 42416888009760.0, + "grad_norm": 4.284532442550426, + "language_loss": 0.81865203, + "learning_rate": 3.752665892369369e-06, + "loss": 0.84710616, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 2.968203067779541 + }, + { + "auxiliary_loss_clip": 0.0150622, + "auxiliary_loss_mlp": 0.01357483, + "balance_loss_clip": 1.15276313, + "balance_loss_mlp": 1.08396876, + "epoch": 0.18523974147001354, + "flos": 24099869171520.0, + "grad_norm": 1.8844722300280452, + "language_loss": 0.74365878, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.77229583, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 2.8046107292175293 + }, + { + "auxiliary_loss_clip": 0.01510454, + "auxiliary_loss_mlp": 0.01344878, + "balance_loss_clip": 1.15704322, + "balance_loss_mlp": 1.06526041, + "epoch": 0.1852998647226815, + "flos": 27377425978560.0, + "grad_norm": 2.93160646427336, + "language_loss": 0.72348475, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.75203812, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 2.8370273113250732 + }, + { + "auxiliary_loss_clip": 0.01506398, + "auxiliary_loss_mlp": 0.0134316, + "balance_loss_clip": 1.15389037, + "balance_loss_mlp": 1.06430507, + "epoch": 0.18535998797534947, + "flos": 18334731288960.0, + "grad_norm": 2.947979895023895, + "language_loss": 0.69971645, + "learning_rate": 3.752102775364407e-06, + "loss": 0.72821206, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 2.8484458923339844 + }, + { + "auxiliary_loss_clip": 0.01507741, + "auxiliary_loss_mlp": 0.01328009, + "balance_loss_clip": 1.15578091, + "balance_loss_mlp": 1.05029893, + "epoch": 0.18542011122801744, + "flos": 37848547310400.0, + "grad_norm": 9.76373568218088, + "language_loss": 0.69434202, + "learning_rate": 3.751914936806767e-06, + "loss": 0.72269952, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 2.9244680404663086 + }, + { + "auxiliary_loss_clip": 0.01506384, + "auxiliary_loss_mlp": 0.01322366, + "balance_loss_clip": 1.15384197, + "balance_loss_mlp": 1.04179525, + "epoch": 0.1854802344806854, + "flos": 25188152796000.0, + "grad_norm": 1.6361276504357336, + "language_loss": 0.7780447, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80633223, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.8001372814178467 + }, + { + "auxiliary_loss_clip": 0.0150496, + "auxiliary_loss_mlp": 0.01336795, + "balance_loss_clip": 1.15239334, + "balance_loss_mlp": 1.05755877, + "epoch": 0.18554035773335337, + "flos": 26687050688160.0, + "grad_norm": 2.276719865100415, + "language_loss": 0.73709404, + "learning_rate": 3.751539060400244e-06, + "loss": 0.76551163, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.84568190574646 + }, + { + "auxiliary_loss_clip": 0.01510694, + "auxiliary_loss_mlp": 0.01344751, + "balance_loss_clip": 1.15798461, + "balance_loss_mlp": 1.06799471, + "epoch": 0.18560048098602133, + "flos": 22349125746720.0, + "grad_norm": 3.5927017198574256, + "language_loss": 0.70241129, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.73096573, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 2.7797746658325195 + }, + { + "auxiliary_loss_clip": 0.01510541, + "auxiliary_loss_mlp": 0.0133212, + "balance_loss_clip": 1.15739584, + "balance_loss_mlp": 1.04792523, + "epoch": 0.18566060423868933, + "flos": 17750442155040.0, + "grad_norm": 3.4515561002217123, + "language_loss": 0.72773659, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.75616324, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.745363473892212 + }, + { + "auxiliary_loss_clip": 0.01508329, + "auxiliary_loss_mlp": 0.01316911, + "balance_loss_clip": 1.15555978, + "balance_loss_mlp": 1.03691185, + "epoch": 0.1857207274913573, + "flos": 24679379357280.0, + "grad_norm": 2.2346008211888693, + "language_loss": 0.91985059, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94810295, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.858945608139038 + }, + { + "auxiliary_loss_clip": 0.01508943, + "auxiliary_loss_mlp": 0.01315683, + "balance_loss_clip": 1.1559757, + "balance_loss_mlp": 1.03511202, + "epoch": 0.18578085074402526, + "flos": 28150371601920.0, + "grad_norm": 3.4393655162345644, + "language_loss": 0.58479035, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.61303657, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 2.8025174140930176 + }, + { + "auxiliary_loss_clip": 0.01509908, + "auxiliary_loss_mlp": 0.01332731, + "balance_loss_clip": 1.15611458, + "balance_loss_mlp": 1.0559746, + "epoch": 0.18584097399669322, + "flos": 23954185651680.0, + "grad_norm": 2.104462786821982, + "language_loss": 0.81887925, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84730566, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.823539972305298 + }, + { + "auxiliary_loss_clip": 0.01508597, + "auxiliary_loss_mlp": 0.01332476, + "balance_loss_clip": 1.15587533, + "balance_loss_mlp": 1.05057001, + "epoch": 0.18590109724936119, + "flos": 17203588479360.0, + "grad_norm": 15.418678288934121, + "language_loss": 0.84618717, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.87459791, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.7599618434906006 + }, + { + "auxiliary_loss_clip": 0.01507792, + "auxiliary_loss_mlp": 0.01332919, + "balance_loss_clip": 1.1544292, + "balance_loss_mlp": 1.05463648, + "epoch": 0.18596122050202915, + "flos": 17236358773920.0, + "grad_norm": 2.4255982166403953, + "language_loss": 0.9354282, + "learning_rate": 3.750221401168038e-06, + "loss": 0.96383536, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.720224380493164 + }, + { + "auxiliary_loss_clip": 0.01508165, + "auxiliary_loss_mlp": 0.01334436, + "balance_loss_clip": 1.15524769, + "balance_loss_mlp": 1.05920529, + "epoch": 0.18602134375469712, + "flos": 19022451608160.0, + "grad_norm": 2.825265732241812, + "language_loss": 0.77034217, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79876816, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.7646751403808594 + }, + { + "auxiliary_loss_clip": 0.01510632, + "auxiliary_loss_mlp": 0.01328038, + "balance_loss_clip": 1.15704131, + "balance_loss_mlp": 1.05109143, + "epoch": 0.1860814670073651, + "flos": 50953843513440.0, + "grad_norm": 1.9136184193399675, + "language_loss": 0.70249301, + "learning_rate": 3.749844329677425e-06, + "loss": 0.73087972, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 3.0215559005737305 + }, + { + "auxiliary_loss_clip": 0.01509725, + "auxiliary_loss_mlp": 0.01336001, + "balance_loss_clip": 1.15607786, + "balance_loss_mlp": 1.05771875, + "epoch": 0.18614159026003307, + "flos": 19393013374560.0, + "grad_norm": 2.98581986565183, + "language_loss": 0.80811572, + "learning_rate": 3.749655694397135e-06, + "loss": 0.836573, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.7626726627349854 + }, + { + "auxiliary_loss_clip": 0.01510857, + "auxiliary_loss_mlp": 0.01326085, + "balance_loss_clip": 1.15691519, + "balance_loss_mlp": 1.04913783, + "epoch": 0.18620171351270104, + "flos": 21800944585440.0, + "grad_norm": 2.80845255882355, + "language_loss": 0.75477159, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.78314102, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 4.2843241691589355 + }, + { + "auxiliary_loss_clip": 0.01519012, + "auxiliary_loss_mlp": 0.01357752, + "balance_loss_clip": 1.16506958, + "balance_loss_mlp": 1.08824372, + "epoch": 0.186261836765369, + "flos": 16364760913440.0, + "grad_norm": 2.686130003228834, + "language_loss": 0.66760528, + "learning_rate": 3.749278224802352e-06, + "loss": 0.69637293, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.763155698776245 + }, + { + "auxiliary_loss_clip": 0.01512401, + "auxiliary_loss_mlp": 0.01347169, + "balance_loss_clip": 1.15838146, + "balance_loss_mlp": 1.07460904, + "epoch": 0.18632196001803697, + "flos": 23372437704480.0, + "grad_norm": 1.8993238735322016, + "language_loss": 0.69458628, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72318196, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 2.792233467102051 + }, + { + "auxiliary_loss_clip": 0.0150723, + "auxiliary_loss_mlp": 0.01336065, + "balance_loss_clip": 1.15369022, + "balance_loss_mlp": 1.05759203, + "epoch": 0.18638208327070493, + "flos": 22494202416000.0, + "grad_norm": 1.9802557225484516, + "language_loss": 0.7203393, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.74877226, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.802403450012207 + }, + { + "auxiliary_loss_clip": 0.01509782, + "auxiliary_loss_mlp": 0.01332582, + "balance_loss_clip": 1.15654945, + "balance_loss_mlp": 1.05449069, + "epoch": 0.18644220652337293, + "flos": 29167615054080.0, + "grad_norm": 1.9474443043255774, + "language_loss": 0.80058926, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82901293, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.8401012420654297 + }, + { + "auxiliary_loss_clip": 0.01515326, + "auxiliary_loss_mlp": 0.01311675, + "balance_loss_clip": 1.16187477, + "balance_loss_mlp": 1.0368259, + "epoch": 0.1865023297760409, + "flos": 24246121613760.0, + "grad_norm": 2.11060011925519, + "language_loss": 0.77139127, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79966134, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 4.375356674194336 + }, + { + "auxiliary_loss_clip": 0.0150491, + "auxiliary_loss_mlp": 0.01330202, + "balance_loss_clip": 1.15145564, + "balance_loss_mlp": 1.04505348, + "epoch": 0.18656245302870886, + "flos": 19130358316320.0, + "grad_norm": 3.1868720822682204, + "language_loss": 0.76858413, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.7969352, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 2.7730791568756104 + }, + { + "auxiliary_loss_clip": 0.01512815, + "auxiliary_loss_mlp": 0.01320291, + "balance_loss_clip": 1.15976667, + "balance_loss_mlp": 1.04029238, + "epoch": 0.18662257628137682, + "flos": 17788067254080.0, + "grad_norm": 1.907725923062327, + "language_loss": 0.79145223, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81978327, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 2.7624902725219727 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.0135365, + "balance_loss_clip": 1.16687489, + "balance_loss_mlp": 1.08204365, + "epoch": 0.1866826995340448, + "flos": 24026667094080.0, + "grad_norm": 5.451301363605311, + "language_loss": 0.85138857, + "learning_rate": 3.747954992113354e-06, + "loss": 0.88013089, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 2.8600852489471436 + }, + { + "auxiliary_loss_clip": 0.01511424, + "auxiliary_loss_mlp": 0.01346629, + "balance_loss_clip": 1.15750134, + "balance_loss_mlp": 1.07311487, + "epoch": 0.18674282278671275, + "flos": 26143686403200.0, + "grad_norm": 2.1968345511500518, + "language_loss": 0.86954719, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89812773, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 4.293106555938721 + }, + { + "auxiliary_loss_clip": 0.01520905, + "auxiliary_loss_mlp": 0.0134735, + "balance_loss_clip": 1.16785502, + "balance_loss_mlp": 1.06925845, + "epoch": 0.18680294603938072, + "flos": 19203598321920.0, + "grad_norm": 2.220316372927671, + "language_loss": 0.78272915, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.81141174, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 4.176691770553589 + }, + { + "auxiliary_loss_clip": 0.01516693, + "auxiliary_loss_mlp": 0.01341393, + "balance_loss_clip": 1.16293466, + "balance_loss_mlp": 1.06768882, + "epoch": 0.1868630692920487, + "flos": 28547217947520.0, + "grad_norm": 2.3838073962249644, + "language_loss": 0.74343145, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.77201223, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 2.838031768798828 + }, + { + "auxiliary_loss_clip": 0.01513713, + "auxiliary_loss_mlp": 0.01314124, + "balance_loss_clip": 1.16156554, + "balance_loss_mlp": 1.03317153, + "epoch": 0.18692319254471668, + "flos": 17239924020960.0, + "grad_norm": 4.083196963630566, + "language_loss": 0.74542063, + "learning_rate": 3.747197400772658e-06, + "loss": 0.77369899, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 2.83948016166687 + }, + { + "auxiliary_loss_clip": 0.01516797, + "auxiliary_loss_mlp": 0.01329892, + "balance_loss_clip": 1.16191506, + "balance_loss_mlp": 1.05466199, + "epoch": 0.18698331579738464, + "flos": 23187422318400.0, + "grad_norm": 3.00549096086685, + "language_loss": 0.84252226, + "learning_rate": 3.747007837284772e-06, + "loss": 0.8709892, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 2.9310739040374756 + }, + { + "auxiliary_loss_clip": 0.01515969, + "auxiliary_loss_mlp": 0.01364547, + "balance_loss_clip": 1.16253388, + "balance_loss_mlp": 1.09275019, + "epoch": 0.1870434390500526, + "flos": 25518889630080.0, + "grad_norm": 1.7119150841801278, + "language_loss": 0.84844732, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.87725258, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.795297622680664 + }, + { + "auxiliary_loss_clip": 0.01519552, + "auxiliary_loss_mlp": 0.01325346, + "balance_loss_clip": 1.16592598, + "balance_loss_mlp": 1.04954302, + "epoch": 0.18710356230272057, + "flos": 19502892347040.0, + "grad_norm": 2.154439819503712, + "language_loss": 0.76715606, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.79560506, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 2.7722396850585938 + }, + { + "auxiliary_loss_clip": 0.01513189, + "auxiliary_loss_mlp": 0.01331329, + "balance_loss_clip": 1.15972948, + "balance_loss_mlp": 1.06105733, + "epoch": 0.18716368555538854, + "flos": 26763287018400.0, + "grad_norm": 2.23821494339102, + "language_loss": 0.6452812, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.67372638, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 2.8133792877197266 + }, + { + "auxiliary_loss_clip": 0.01510726, + "auxiliary_loss_mlp": 0.01330868, + "balance_loss_clip": 1.15878677, + "balance_loss_mlp": 1.05315816, + "epoch": 0.1872238088080565, + "flos": 25192059396480.0, + "grad_norm": 2.6603096321267614, + "language_loss": 0.81537437, + "learning_rate": 3.746248920938024e-06, + "loss": 0.84379029, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 2.828535556793213 + }, + { + "auxiliary_loss_clip": 0.01516744, + "auxiliary_loss_mlp": 0.01327423, + "balance_loss_clip": 1.16449022, + "balance_loss_mlp": 1.05047572, + "epoch": 0.1872839320607245, + "flos": 24136621922880.0, + "grad_norm": 2.2398074696233476, + "language_loss": 0.57366335, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60210502, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.770796060562134 + }, + { + "auxiliary_loss_clip": 0.01519763, + "auxiliary_loss_mlp": 0.01325766, + "balance_loss_clip": 1.16865075, + "balance_loss_mlp": 1.04958224, + "epoch": 0.18734405531339246, + "flos": 21175654746240.0, + "grad_norm": 2.0360721449270796, + "language_loss": 0.71701884, + "learning_rate": 3.745869065428261e-06, + "loss": 0.7454741, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.9084198474884033 + }, + { + "auxiliary_loss_clip": 0.01518263, + "auxiliary_loss_mlp": 0.01319093, + "balance_loss_clip": 1.1665864, + "balance_loss_mlp": 1.04233623, + "epoch": 0.18740417856606043, + "flos": 17239582667520.0, + "grad_norm": 2.4825362693762942, + "language_loss": 0.79370415, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.82207769, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.7914328575134277 + }, + { + "auxiliary_loss_clip": 0.01515383, + "auxiliary_loss_mlp": 0.01339639, + "balance_loss_clip": 1.1641711, + "balance_loss_mlp": 1.05563509, + "epoch": 0.1874643018187284, + "flos": 32560588344960.0, + "grad_norm": 5.422720581137283, + "language_loss": 0.8450464, + "learning_rate": 3.745488945104381e-06, + "loss": 0.87359667, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 2.8416810035705566 + }, + { + "auxiliary_loss_clip": 0.01514743, + "auxiliary_loss_mlp": 0.01335907, + "balance_loss_clip": 1.1626941, + "balance_loss_mlp": 1.06391954, + "epoch": 0.18752442507139636, + "flos": 23260396826880.0, + "grad_norm": 2.416305517213692, + "language_loss": 0.76507843, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.79358494, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 2.7770843505859375 + }, + { + "auxiliary_loss_clip": 0.01517775, + "auxiliary_loss_mlp": 0.01319317, + "balance_loss_clip": 1.16663551, + "balance_loss_mlp": 1.04465866, + "epoch": 0.18758454832406432, + "flos": 21763016061120.0, + "grad_norm": 2.574360366236675, + "language_loss": 0.82233822, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.85070908, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 2.779236316680908 + }, + { + "auxiliary_loss_clip": 0.01510297, + "auxiliary_loss_mlp": 0.01327932, + "balance_loss_clip": 1.15892601, + "balance_loss_mlp": 1.05575323, + "epoch": 0.1876446715767323, + "flos": 29572805594880.0, + "grad_norm": 5.648187179025866, + "language_loss": 0.85243005, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.88081235, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.78743052482605 + }, + { + "auxiliary_loss_clip": 0.01523927, + "auxiliary_loss_mlp": 0.01345462, + "balance_loss_clip": 1.1729275, + "balance_loss_mlp": 1.06622589, + "epoch": 0.18770479482940028, + "flos": 30342830749920.0, + "grad_norm": 5.6601048306104005, + "language_loss": 0.71143997, + "learning_rate": 3.744727910244937e-06, + "loss": 0.74013394, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 2.8507652282714844 + }, + { + "auxiliary_loss_clip": 0.0151926, + "auxiliary_loss_mlp": 0.01335475, + "balance_loss_clip": 1.16926301, + "balance_loss_mlp": 1.05986273, + "epoch": 0.18776491808206824, + "flos": 14467120267680.0, + "grad_norm": 2.1854787466001575, + "language_loss": 0.70870996, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.73725736, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.769986391067505 + }, + { + "auxiliary_loss_clip": 0.0151732, + "auxiliary_loss_mlp": 0.01327229, + "balance_loss_clip": 1.16667151, + "balance_loss_mlp": 1.05142689, + "epoch": 0.1878250413347362, + "flos": 24500849686560.0, + "grad_norm": 6.461439016691642, + "language_loss": 0.74852133, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.77696687, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.773958683013916 + }, + { + "auxiliary_loss_clip": 0.01521877, + "auxiliary_loss_mlp": 0.0134709, + "balance_loss_clip": 1.17081857, + "balance_loss_mlp": 1.07586515, + "epoch": 0.18788516458740417, + "flos": 39789350566560.0, + "grad_norm": 1.7411299570975374, + "language_loss": 0.80719048, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.83588016, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.9650022983551025 + }, + { + "auxiliary_loss_clip": 0.01619599, + "auxiliary_loss_mlp": 0.01302498, + "balance_loss_clip": 1.28076923, + "balance_loss_mlp": 1.07514191, + "epoch": 0.18794528784007214, + "flos": 64705479684480.0, + "grad_norm": 0.9429141992554901, + "language_loss": 0.63598067, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.6652016, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 3.2990899085998535 + }, + { + "auxiliary_loss_clip": 0.01530158, + "auxiliary_loss_mlp": 0.01336358, + "balance_loss_clip": 1.17976928, + "balance_loss_mlp": 1.06398857, + "epoch": 0.1880054110927401, + "flos": 28624630050720.0, + "grad_norm": 2.505914871069609, + "language_loss": 0.81264114, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.84130633, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.8164381980895996 + }, + { + "auxiliary_loss_clip": 0.01621803, + "auxiliary_loss_mlp": 0.01299057, + "balance_loss_clip": 1.28543174, + "balance_loss_mlp": 1.07933044, + "epoch": 0.1880655343454081, + "flos": 64495658917440.0, + "grad_norm": 0.7794285313364316, + "language_loss": 0.61849117, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64769971, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.3592686653137207 + }, + { + "auxiliary_loss_clip": 0.01523014, + "auxiliary_loss_mlp": 0.01342308, + "balance_loss_clip": 1.1728282, + "balance_loss_mlp": 1.06688654, + "epoch": 0.18812565759807606, + "flos": 32127216816960.0, + "grad_norm": 1.992609418121661, + "language_loss": 0.71291554, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.7415688, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 2.8635997772216797 + }, + { + "auxiliary_loss_clip": 0.01523343, + "auxiliary_loss_mlp": 0.0133359, + "balance_loss_clip": 1.17418003, + "balance_loss_mlp": 1.06484485, + "epoch": 0.18818578085074403, + "flos": 20626259883840.0, + "grad_norm": 2.4960121338566683, + "language_loss": 0.85347939, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.88204873, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.760443925857544 + }, + { + "auxiliary_loss_clip": 0.01530055, + "auxiliary_loss_mlp": 0.01337572, + "balance_loss_clip": 1.17997861, + "balance_loss_mlp": 1.06463027, + "epoch": 0.188245904103412, + "flos": 28843060510080.0, + "grad_norm": 4.162164449337514, + "language_loss": 0.76768339, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.79635966, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.881633996963501 + }, + { + "auxiliary_loss_clip": 0.01538544, + "auxiliary_loss_mlp": 0.01346916, + "balance_loss_clip": 1.1878978, + "balance_loss_mlp": 1.07607269, + "epoch": 0.18830602735607996, + "flos": 29422684480320.0, + "grad_norm": 1.9234564932339546, + "language_loss": 0.81265914, + "learning_rate": 3.74282069289017e-06, + "loss": 0.84151381, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.8438379764556885 + }, + { + "auxiliary_loss_clip": 0.0153332, + "auxiliary_loss_mlp": 0.01348422, + "balance_loss_clip": 1.18464422, + "balance_loss_mlp": 1.07662439, + "epoch": 0.18836615060874792, + "flos": 28875261882240.0, + "grad_norm": 2.034432190920446, + "language_loss": 0.79942048, + "learning_rate": 3.742629607551614e-06, + "loss": 0.82823789, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.906350612640381 + }, + { + "auxiliary_loss_clip": 0.01537359, + "auxiliary_loss_mlp": 0.01339465, + "balance_loss_clip": 1.18760371, + "balance_loss_mlp": 1.06862187, + "epoch": 0.18842627386141592, + "flos": 22603891747680.0, + "grad_norm": 2.1041069190797135, + "language_loss": 0.82894981, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85771805, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.8335120677948 + }, + { + "auxiliary_loss_clip": 0.01535569, + "auxiliary_loss_mlp": 0.01320867, + "balance_loss_clip": 1.1858933, + "balance_loss_mlp": 1.04792523, + "epoch": 0.18848639711408388, + "flos": 24576024028320.0, + "grad_norm": 2.3452844867320866, + "language_loss": 0.82797396, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8565383, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.838146686553955 + }, + { + "auxiliary_loss_clip": 0.01542212, + "auxiliary_loss_mlp": 0.01340523, + "balance_loss_clip": 1.19168019, + "balance_loss_mlp": 1.06987, + "epoch": 0.18854652036675185, + "flos": 34169744491200.0, + "grad_norm": 2.5945940966659164, + "language_loss": 0.78084719, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80967456, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 4.395394325256348 + }, + { + "auxiliary_loss_clip": 0.01546477, + "auxiliary_loss_mlp": 0.01349521, + "balance_loss_clip": 1.1963886, + "balance_loss_mlp": 1.0754354, + "epoch": 0.1886066436194198, + "flos": 24202162512000.0, + "grad_norm": 2.05865036561683, + "language_loss": 0.81120723, + "learning_rate": 3.741864605462996e-06, + "loss": 0.84016722, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.8133513927459717 + }, + { + "auxiliary_loss_clip": 0.01546876, + "auxiliary_loss_mlp": 0.01335152, + "balance_loss_clip": 1.1974175, + "balance_loss_mlp": 1.05343664, + "epoch": 0.18866676687208778, + "flos": 21253218562080.0, + "grad_norm": 2.012437499238867, + "language_loss": 0.81156534, + "learning_rate": 3.741673189793504e-06, + "loss": 0.84038568, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 2.764559507369995 + }, + { + "auxiliary_loss_clip": 0.01550907, + "auxiliary_loss_mlp": 0.01335189, + "balance_loss_clip": 1.2030648, + "balance_loss_mlp": 1.0569073, + "epoch": 0.18872689012475574, + "flos": 37311896309760.0, + "grad_norm": 6.377031506862428, + "language_loss": 0.63768494, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.66654599, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 2.9341135025024414 + }, + { + "auxiliary_loss_clip": 0.01551157, + "auxiliary_loss_mlp": 0.01329732, + "balance_loss_clip": 1.20262063, + "balance_loss_mlp": 1.05240321, + "epoch": 0.1887870133774237, + "flos": 21654426646080.0, + "grad_norm": 2.908311816126771, + "language_loss": 0.71407437, + "learning_rate": 3.741290160328514e-06, + "loss": 0.74288327, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 4.26977801322937 + }, + { + "auxiliary_loss_clip": 0.01555063, + "auxiliary_loss_mlp": 0.01343359, + "balance_loss_clip": 1.20720673, + "balance_loss_mlp": 1.07404113, + "epoch": 0.1888471366300917, + "flos": 15926496652800.0, + "grad_norm": 2.8849097864557227, + "language_loss": 0.87517226, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.90415645, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 2.8057315349578857 + }, + { + "auxiliary_loss_clip": 0.01546859, + "auxiliary_loss_mlp": 0.01334371, + "balance_loss_clip": 1.20070028, + "balance_loss_mlp": 1.05761456, + "epoch": 0.18890725988275966, + "flos": 18553958239680.0, + "grad_norm": 2.493778069397931, + "language_loss": 0.77368605, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.80249834, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.7200496196746826 + }, + { + "auxiliary_loss_clip": 0.01552404, + "auxiliary_loss_mlp": 0.0134111, + "balance_loss_clip": 1.2060827, + "balance_loss_mlp": 1.0691216, + "epoch": 0.18896738313542763, + "flos": 28843439791680.0, + "grad_norm": 2.220734694131723, + "language_loss": 0.79237592, + "learning_rate": 3.740715120924971e-06, + "loss": 0.821311, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 4.400701999664307 + }, + { + "auxiliary_loss_clip": 0.01548782, + "auxiliary_loss_mlp": 0.01362747, + "balance_loss_clip": 1.20177388, + "balance_loss_mlp": 1.09171271, + "epoch": 0.1890275063880956, + "flos": 22414590479520.0, + "grad_norm": 2.475858821470502, + "language_loss": 0.71407074, + "learning_rate": 3.740523309097912e-06, + "loss": 0.743186, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 2.7620389461517334 + }, + { + "auxiliary_loss_clip": 0.01552757, + "auxiliary_loss_mlp": 0.01336721, + "balance_loss_clip": 1.20636344, + "balance_loss_mlp": 1.05786669, + "epoch": 0.18908762964076356, + "flos": 24246235398240.0, + "grad_norm": 3.205347085321298, + "language_loss": 0.73467392, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.7635687, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 4.29331111907959 + }, + { + "auxiliary_loss_clip": 0.01544726, + "auxiliary_loss_mlp": 0.01333622, + "balance_loss_clip": 1.19864607, + "balance_loss_mlp": 1.0570569, + "epoch": 0.18914775289343153, + "flos": 16984816666560.0, + "grad_norm": 4.016482245185989, + "language_loss": 0.76272863, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79151207, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.8171188831329346 + }, + { + "auxiliary_loss_clip": 0.01548967, + "auxiliary_loss_mlp": 0.0132282, + "balance_loss_clip": 1.20109284, + "balance_loss_mlp": 1.04739881, + "epoch": 0.1892078761460995, + "flos": 21545951015520.0, + "grad_norm": 2.203238650243551, + "language_loss": 0.78455901, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.81327683, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.741567850112915 + }, + { + "auxiliary_loss_clip": 0.01555764, + "auxiliary_loss_mlp": 0.01350086, + "balance_loss_clip": 1.20850277, + "balance_loss_mlp": 1.07828903, + "epoch": 0.18926799939876748, + "flos": 23003696489760.0, + "grad_norm": 12.779225335833615, + "language_loss": 0.66535944, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69441795, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.830429792404175 + }, + { + "auxiliary_loss_clip": 0.01548472, + "auxiliary_loss_mlp": 0.01333916, + "balance_loss_clip": 1.20176375, + "balance_loss_mlp": 1.06097484, + "epoch": 0.18932812265143545, + "flos": 22275203034240.0, + "grad_norm": 2.207007090380393, + "language_loss": 0.75656509, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78538901, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 2.8021981716156006 + }, + { + "auxiliary_loss_clip": 0.01558564, + "auxiliary_loss_mlp": 0.01322276, + "balance_loss_clip": 1.21198869, + "balance_loss_mlp": 1.05333948, + "epoch": 0.1893882459041034, + "flos": 18626401753920.0, + "grad_norm": 4.597256586892112, + "language_loss": 0.80916482, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.8379733, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 2.790437698364258 + }, + { + "auxiliary_loss_clip": 0.01545063, + "auxiliary_loss_mlp": 0.01333156, + "balance_loss_clip": 1.19629014, + "balance_loss_mlp": 1.05830693, + "epoch": 0.18944836915677138, + "flos": 22895182931040.0, + "grad_norm": 2.583112996042772, + "language_loss": 0.85336077, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.88214302, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 2.828056812286377 + }, + { + "auxiliary_loss_clip": 0.0155223, + "auxiliary_loss_mlp": 0.01332698, + "balance_loss_clip": 1.20210934, + "balance_loss_mlp": 1.0580399, + "epoch": 0.18950849240943934, + "flos": 26798939853120.0, + "grad_norm": 3.2256355731643644, + "language_loss": 0.74529332, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.77414256, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 2.847017288208008 + }, + { + "auxiliary_loss_clip": 0.01552825, + "auxiliary_loss_mlp": 0.01344877, + "balance_loss_clip": 1.20332122, + "balance_loss_mlp": 1.07117295, + "epoch": 0.1895686156621073, + "flos": 24973477224480.0, + "grad_norm": 2.4327777519552134, + "language_loss": 0.7541346, + "learning_rate": 3.738794033491209e-06, + "loss": 0.78311157, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.8545591831207275 + }, + { + "auxiliary_loss_clip": 0.01545968, + "auxiliary_loss_mlp": 0.01328522, + "balance_loss_clip": 1.19728553, + "balance_loss_mlp": 1.05367315, + "epoch": 0.1896287389147753, + "flos": 21946703961600.0, + "grad_norm": 2.8048681772756856, + "language_loss": 0.79609603, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.8248409, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 2.8219990730285645 + }, + { + "auxiliary_loss_clip": 0.0155102, + "auxiliary_loss_mlp": 0.01345731, + "balance_loss_clip": 1.20283389, + "balance_loss_mlp": 1.06802142, + "epoch": 0.18968886216744327, + "flos": 18180589789440.0, + "grad_norm": 3.304779208141082, + "language_loss": 0.73204571, + "learning_rate": 3.738409024548223e-06, + "loss": 0.76101327, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 2.6934075355529785 + }, + { + "auxiliary_loss_clip": 0.01552921, + "auxiliary_loss_mlp": 0.01337508, + "balance_loss_clip": 1.20377159, + "balance_loss_mlp": 1.06876254, + "epoch": 0.18974898542011123, + "flos": 20414580636960.0, + "grad_norm": 3.0161016742483806, + "language_loss": 0.73944336, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76834762, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 2.7556004524230957 + }, + { + "auxiliary_loss_clip": 0.01545105, + "auxiliary_loss_mlp": 0.01343556, + "balance_loss_clip": 1.19523335, + "balance_loss_mlp": 1.0736661, + "epoch": 0.1898091086727792, + "flos": 23987145587040.0, + "grad_norm": 3.580148580995599, + "language_loss": 0.6846205, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.71350718, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 2.8223416805267334 + }, + { + "auxiliary_loss_clip": 0.0155091, + "auxiliary_loss_mlp": 0.01341772, + "balance_loss_clip": 1.20295691, + "balance_loss_mlp": 1.06673205, + "epoch": 0.18986923192544716, + "flos": 27639587970720.0, + "grad_norm": 2.103314959976021, + "language_loss": 0.79765433, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82658112, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.868889093399048 + }, + { + "auxiliary_loss_clip": 0.01545435, + "auxiliary_loss_mlp": 0.01340657, + "balance_loss_clip": 1.19756651, + "balance_loss_mlp": 1.06065869, + "epoch": 0.18992935517811513, + "flos": 25486536545280.0, + "grad_norm": 3.3153791021357306, + "language_loss": 0.72216916, + "learning_rate": 3.737638215672964e-06, + "loss": 0.75103009, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 2.88472580909729 + }, + { + "auxiliary_loss_clip": 0.01554093, + "auxiliary_loss_mlp": 0.01339341, + "balance_loss_clip": 1.20439684, + "balance_loss_mlp": 1.06868815, + "epoch": 0.1899894784307831, + "flos": 17422739573760.0, + "grad_norm": 3.5111836105832537, + "language_loss": 0.85283887, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.88177323, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 2.7590227127075195 + }, + { + "auxiliary_loss_clip": 0.01550112, + "auxiliary_loss_mlp": 0.01324812, + "balance_loss_clip": 1.20109594, + "balance_loss_mlp": 1.05263329, + "epoch": 0.19004960168345109, + "flos": 27494814726720.0, + "grad_norm": 3.0558555246960126, + "language_loss": 0.73723459, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.76598388, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 2.8595449924468994 + }, + { + "auxiliary_loss_clip": 0.01551455, + "auxiliary_loss_mlp": 0.01342828, + "balance_loss_clip": 1.20176911, + "balance_loss_mlp": 1.07484519, + "epoch": 0.19010972493611905, + "flos": 38657752619040.0, + "grad_norm": 2.413502855108309, + "language_loss": 0.81302482, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.84196764, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 2.9304120540618896 + }, + { + "auxiliary_loss_clip": 0.01541614, + "auxiliary_loss_mlp": 0.01343748, + "balance_loss_clip": 1.19116497, + "balance_loss_mlp": 1.07519305, + "epoch": 0.19016984818878702, + "flos": 19246760932320.0, + "grad_norm": 2.12594579869327, + "language_loss": 0.75871181, + "learning_rate": 3.73686635253511e-06, + "loss": 0.78756541, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.7708489894866943 + }, + { + "auxiliary_loss_clip": 0.01547651, + "auxiliary_loss_mlp": 0.01323044, + "balance_loss_clip": 1.1984278, + "balance_loss_mlp": 1.04647827, + "epoch": 0.19022997144145498, + "flos": 37599925671360.0, + "grad_norm": 2.247210337073213, + "language_loss": 0.74906522, + "learning_rate": 3.736673222076982e-06, + "loss": 0.77777219, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.9026219844818115 + }, + { + "auxiliary_loss_clip": 0.01552145, + "auxiliary_loss_mlp": 0.01344854, + "balance_loss_clip": 1.20249057, + "balance_loss_mlp": 1.0743916, + "epoch": 0.19029009469412295, + "flos": 61535261027520.0, + "grad_norm": 2.4102629030064073, + "language_loss": 0.67201436, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.7009843, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 3.1145474910736084 + }, + { + "auxiliary_loss_clip": 0.01543502, + "auxiliary_loss_mlp": 0.01330459, + "balance_loss_clip": 1.19364107, + "balance_loss_mlp": 1.05560994, + "epoch": 0.1903502179467909, + "flos": 13956602133600.0, + "grad_norm": 2.270946512222568, + "language_loss": 0.74488533, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.7736249, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.7732746601104736 + }, + { + "auxiliary_loss_clip": 0.01660053, + "auxiliary_loss_mlp": 0.01310463, + "balance_loss_clip": 1.31936526, + "balance_loss_mlp": 1.09073639, + "epoch": 0.1904103411994589, + "flos": 66906624381120.0, + "grad_norm": 0.7802507955093285, + "language_loss": 0.50349128, + "learning_rate": 3.736093435602968e-06, + "loss": 0.53319645, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 3.3319432735443115 + }, + { + "auxiliary_loss_clip": 0.01542617, + "auxiliary_loss_mlp": 0.01326771, + "balance_loss_clip": 1.19185615, + "balance_loss_mlp": 1.05649996, + "epoch": 0.19047046445212687, + "flos": 21910937342400.0, + "grad_norm": 1.9095537868736454, + "language_loss": 0.74509823, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.77379215, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.8224048614501953 + }, + { + "auxiliary_loss_clip": 0.01654137, + "auxiliary_loss_mlp": 0.01324181, + "balance_loss_clip": 1.31255913, + "balance_loss_mlp": 1.10292816, + "epoch": 0.19053058770479483, + "flos": 59260648392000.0, + "grad_norm": 0.877680914385746, + "language_loss": 0.59939927, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62918246, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.1841819286346436 + }, + { + "auxiliary_loss_clip": 0.01534402, + "auxiliary_loss_mlp": 0.01329584, + "balance_loss_clip": 1.1845839, + "balance_loss_mlp": 1.05511665, + "epoch": 0.1905907109574628, + "flos": 23953578801120.0, + "grad_norm": 1.6954033462469198, + "language_loss": 0.78375852, + "learning_rate": 3.735513056633436e-06, + "loss": 0.81239831, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.7897274494171143 + }, + { + "auxiliary_loss_clip": 0.01544048, + "auxiliary_loss_mlp": 0.0133476, + "balance_loss_clip": 1.19364905, + "balance_loss_mlp": 1.06868482, + "epoch": 0.19065083421013077, + "flos": 20814347450880.0, + "grad_norm": 2.195088990590669, + "language_loss": 0.78526103, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.81404912, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.802096128463745 + }, + { + "auxiliary_loss_clip": 0.01532117, + "auxiliary_loss_mlp": 0.01358816, + "balance_loss_clip": 1.18140674, + "balance_loss_mlp": 1.08987999, + "epoch": 0.19071095746279873, + "flos": 31287668616000.0, + "grad_norm": 2.3315766478306457, + "language_loss": 0.78872424, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.81763351, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.8472964763641357 + }, + { + "auxiliary_loss_clip": 0.01532462, + "auxiliary_loss_mlp": 0.01343982, + "balance_loss_clip": 1.1827426, + "balance_loss_mlp": 1.07580829, + "epoch": 0.1907710807154667, + "flos": 14357886073920.0, + "grad_norm": 2.370563929182003, + "language_loss": 0.80463642, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.83340085, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.8261492252349854 + }, + { + "auxiliary_loss_clip": 0.01530053, + "auxiliary_loss_mlp": 0.01341817, + "balance_loss_clip": 1.17931437, + "balance_loss_mlp": 1.07803082, + "epoch": 0.1908312039681347, + "flos": 26909501532480.0, + "grad_norm": 1.9959058599146993, + "language_loss": 0.79217839, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.82089704, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.811305046081543 + }, + { + "auxiliary_loss_clip": 0.01534288, + "auxiliary_loss_mlp": 0.01337787, + "balance_loss_clip": 1.18227828, + "balance_loss_mlp": 1.07056737, + "epoch": 0.19089132722080265, + "flos": 14496059818080.0, + "grad_norm": 1.914251443377858, + "language_loss": 0.81451559, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.84323633, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 4.335779666900635 + }, + { + "auxiliary_loss_clip": 0.01534232, + "auxiliary_loss_mlp": 0.0134885, + "balance_loss_clip": 1.1822058, + "balance_loss_mlp": 1.0841099, + "epoch": 0.19095145047347062, + "flos": 13954516084800.0, + "grad_norm": 3.1562697224066483, + "language_loss": 0.858953, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.88778389, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 2.748314619064331 + }, + { + "auxiliary_loss_clip": 0.01532506, + "auxiliary_loss_mlp": 0.01352591, + "balance_loss_clip": 1.18151486, + "balance_loss_mlp": 1.08346391, + "epoch": 0.19101157372613858, + "flos": 25304214058560.0, + "grad_norm": 3.3462719383841537, + "language_loss": 0.8152585, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.84410948, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.801055431365967 + }, + { + "auxiliary_loss_clip": 0.01533118, + "auxiliary_loss_mlp": 0.01343657, + "balance_loss_clip": 1.18164802, + "balance_loss_mlp": 1.07548332, + "epoch": 0.19107169697880655, + "flos": 20560302084960.0, + "grad_norm": 2.6950928653898085, + "language_loss": 0.75194949, + "learning_rate": 3.73396248424356e-06, + "loss": 0.78071725, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 4.290988922119141 + }, + { + "auxiliary_loss_clip": 0.0152649, + "auxiliary_loss_mlp": 0.01330411, + "balance_loss_clip": 1.17459345, + "balance_loss_mlp": 1.06223798, + "epoch": 0.19113182023147451, + "flos": 22165361989920.0, + "grad_norm": 1.723349628332624, + "language_loss": 0.81598961, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.8445586, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 2.827888011932373 + }, + { + "auxiliary_loss_clip": 0.01524757, + "auxiliary_loss_mlp": 0.01330114, + "balance_loss_clip": 1.17447948, + "balance_loss_mlp": 1.057935, + "epoch": 0.19119194348414248, + "flos": 18583428784320.0, + "grad_norm": 2.607453315823415, + "language_loss": 0.79413021, + "learning_rate": 3.733574183478691e-06, + "loss": 0.82267892, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 2.763775587081909 + }, + { + "auxiliary_loss_clip": 0.0152693, + "auxiliary_loss_mlp": 0.01324664, + "balance_loss_clip": 1.17506146, + "balance_loss_mlp": 1.05229414, + "epoch": 0.19125206673681047, + "flos": 19028671826400.0, + "grad_norm": 3.037457461710687, + "language_loss": 0.79620475, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82472074, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.7794437408447266 + }, + { + "auxiliary_loss_clip": 0.01524351, + "auxiliary_loss_mlp": 0.01337879, + "balance_loss_clip": 1.17283225, + "balance_loss_mlp": 1.06379282, + "epoch": 0.19131218998947844, + "flos": 21692355170400.0, + "grad_norm": 4.005522414890241, + "language_loss": 0.7425282, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.77115047, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 4.23300313949585 + }, + { + "auxiliary_loss_clip": 0.01525476, + "auxiliary_loss_mlp": 0.01333307, + "balance_loss_clip": 1.17353678, + "balance_loss_mlp": 1.06074667, + "epoch": 0.1913723132421464, + "flos": 18444496476960.0, + "grad_norm": 3.147402476225945, + "language_loss": 0.65301007, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.68159789, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 4.260906219482422 + }, + { + "auxiliary_loss_clip": 0.01521111, + "auxiliary_loss_mlp": 0.01336895, + "balance_loss_clip": 1.16806221, + "balance_loss_mlp": 1.06414378, + "epoch": 0.19143243649481437, + "flos": 27162446981760.0, + "grad_norm": 1.8789965291730581, + "language_loss": 0.7361567, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.76473677, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.803792715072632 + }, + { + "auxiliary_loss_clip": 0.01522812, + "auxiliary_loss_mlp": 0.01323223, + "balance_loss_clip": 1.16934299, + "balance_loss_mlp": 1.04932785, + "epoch": 0.19149255974748233, + "flos": 21720725798400.0, + "grad_norm": 2.089751176776337, + "language_loss": 0.88354433, + "learning_rate": 3.732602281292598e-06, + "loss": 0.91200459, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.850545644760132 + }, + { + "auxiliary_loss_clip": 0.01526028, + "auxiliary_loss_mlp": 0.01344778, + "balance_loss_clip": 1.17156506, + "balance_loss_mlp": 1.07965624, + "epoch": 0.1915526830001503, + "flos": 22965350755680.0, + "grad_norm": 2.2596614389302214, + "language_loss": 0.73299301, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.76170111, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.8652892112731934 + }, + { + "auxiliary_loss_clip": 0.01513166, + "auxiliary_loss_mlp": 0.01339784, + "balance_loss_clip": 1.16011596, + "balance_loss_mlp": 1.06874967, + "epoch": 0.1916128062528183, + "flos": 26143155408960.0, + "grad_norm": 2.198420513322204, + "language_loss": 0.83483315, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86336267, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.8053689002990723 + }, + { + "auxiliary_loss_clip": 0.01695208, + "auxiliary_loss_mlp": 0.01446556, + "balance_loss_clip": 1.34064627, + "balance_loss_mlp": 1.23293304, + "epoch": 0.19167292950548626, + "flos": 54931712424480.0, + "grad_norm": 0.9388589892599369, + "language_loss": 0.5580771, + "learning_rate": 3.732018351516544e-06, + "loss": 0.58949476, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 3.3971846103668213 + }, + { + "auxiliary_loss_clip": 0.01511181, + "auxiliary_loss_mlp": 0.01326077, + "balance_loss_clip": 1.15791059, + "balance_loss_mlp": 1.05218124, + "epoch": 0.19173305275815422, + "flos": 29938247059680.0, + "grad_norm": 2.031926438843726, + "language_loss": 0.70222521, + "learning_rate": 3.731823576891397e-06, + "loss": 0.73059779, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 2.922135591506958 + }, + { + "auxiliary_loss_clip": 0.01512975, + "auxiliary_loss_mlp": 0.01309273, + "balance_loss_clip": 1.16006613, + "balance_loss_mlp": 1.03175354, + "epoch": 0.1917931760108222, + "flos": 24754705411680.0, + "grad_norm": 2.1214894935820965, + "language_loss": 0.74288672, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.77110922, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.839953660964966 + }, + { + "auxiliary_loss_clip": 0.01516733, + "auxiliary_loss_mlp": 0.01324083, + "balance_loss_clip": 1.16322851, + "balance_loss_mlp": 1.04503751, + "epoch": 0.19185329926349015, + "flos": 18845552848320.0, + "grad_norm": 1.9843549911795195, + "language_loss": 0.84499073, + "learning_rate": 3.73143383063572e-06, + "loss": 0.8733989, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 2.7652244567871094 + }, + { + "auxiliary_loss_clip": 0.01514757, + "auxiliary_loss_mlp": 0.01322568, + "balance_loss_clip": 1.16074753, + "balance_loss_mlp": 1.04333258, + "epoch": 0.19191342251615812, + "flos": 22088822234400.0, + "grad_norm": 1.9272866893718257, + "language_loss": 0.8961246, + "learning_rate": 3.73123885901997e-06, + "loss": 0.92449784, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.8510923385620117 + }, + { + "auxiliary_loss_clip": 0.01517858, + "auxiliary_loss_mlp": 0.01343682, + "balance_loss_clip": 1.16295719, + "balance_loss_mlp": 1.05700731, + "epoch": 0.19197354576882608, + "flos": 22201242393600.0, + "grad_norm": 4.820555136378324, + "language_loss": 0.75118965, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77980512, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 2.82393741607666 + }, + { + "auxiliary_loss_clip": 0.01512506, + "auxiliary_loss_mlp": 0.01344287, + "balance_loss_clip": 1.15884459, + "balance_loss_mlp": 1.05627751, + "epoch": 0.19203366902149407, + "flos": 24898302882720.0, + "grad_norm": 2.471246817993934, + "language_loss": 0.75276697, + "learning_rate": 3.730848718849612e-06, + "loss": 0.78133488, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 2.85227632522583 + }, + { + "auxiliary_loss_clip": 0.01685617, + "auxiliary_loss_mlp": 0.01372253, + "balance_loss_clip": 1.33187485, + "balance_loss_mlp": 1.14260864, + "epoch": 0.19209379227416204, + "flos": 68422931663040.0, + "grad_norm": 0.8617727458605496, + "language_loss": 0.68410695, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.71468568, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 3.286346435546875 + }, + { + "auxiliary_loss_clip": 0.01518251, + "auxiliary_loss_mlp": 0.01342714, + "balance_loss_clip": 1.16406739, + "balance_loss_mlp": 1.06462252, + "epoch": 0.19215391552683, + "flos": 22057417353600.0, + "grad_norm": 3.101477177765926, + "language_loss": 0.73729563, + "learning_rate": 3.730458316143429e-06, + "loss": 0.76590532, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 2.800072193145752 + }, + { + "auxiliary_loss_clip": 0.01526463, + "auxiliary_loss_mlp": 0.01341464, + "balance_loss_clip": 1.17165852, + "balance_loss_mlp": 1.06451678, + "epoch": 0.19221403877949797, + "flos": 20305043017920.0, + "grad_norm": 1.9481846022027594, + "language_loss": 0.83628821, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.86496747, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 2.8282012939453125 + }, + { + "auxiliary_loss_clip": 0.01512591, + "auxiliary_loss_mlp": 0.0132661, + "balance_loss_clip": 1.15665698, + "balance_loss_mlp": 1.05290568, + "epoch": 0.19227416203216594, + "flos": 23187574031040.0, + "grad_norm": 2.315765329971978, + "language_loss": 0.8029108, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.83130282, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 2.7894155979156494 + }, + { + "auxiliary_loss_clip": 0.01507283, + "auxiliary_loss_mlp": 0.01333402, + "balance_loss_clip": 1.15360212, + "balance_loss_mlp": 1.06217694, + "epoch": 0.1923342852848339, + "flos": 25779041429760.0, + "grad_norm": 2.2257733751662996, + "language_loss": 0.7887646, + "learning_rate": 3.729872219959029e-06, + "loss": 0.81717145, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 2.821150064468384 + }, + { + "auxiliary_loss_clip": 0.01511774, + "auxiliary_loss_mlp": 0.0133162, + "balance_loss_clip": 1.15711832, + "balance_loss_mlp": 1.06287503, + "epoch": 0.19239440853750187, + "flos": 17130234689280.0, + "grad_norm": 2.830951110398247, + "language_loss": 0.83574766, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.86418158, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 2.7823286056518555 + }, + { + "auxiliary_loss_clip": 0.01514693, + "auxiliary_loss_mlp": 0.01352893, + "balance_loss_clip": 1.16033268, + "balance_loss_mlp": 1.08872485, + "epoch": 0.19245453179016986, + "flos": 16436938930560.0, + "grad_norm": 1.830030645000052, + "language_loss": 0.79262805, + "learning_rate": 3.729481161172443e-06, + "loss": 0.8213039, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.776665210723877 + }, + { + "auxiliary_loss_clip": 0.01519092, + "auxiliary_loss_mlp": 0.01352153, + "balance_loss_clip": 1.16418076, + "balance_loss_mlp": 1.08436155, + "epoch": 0.19251465504283782, + "flos": 20232371934720.0, + "grad_norm": 2.244127545470584, + "language_loss": 0.69184387, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.72055632, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 2.969653606414795 + }, + { + "auxiliary_loss_clip": 0.01517409, + "auxiliary_loss_mlp": 0.01345071, + "balance_loss_clip": 1.16216886, + "balance_loss_mlp": 1.08357406, + "epoch": 0.1925747782955058, + "flos": 19466443020960.0, + "grad_norm": 1.8593796196747727, + "language_loss": 0.9149245, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.94354928, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.7644567489624023 + }, + { + "auxiliary_loss_clip": 0.0151715, + "auxiliary_loss_mlp": 0.01365242, + "balance_loss_clip": 1.16297817, + "balance_loss_mlp": 1.10336339, + "epoch": 0.19263490154817375, + "flos": 17787536259840.0, + "grad_norm": 3.6840957007081934, + "language_loss": 0.81841803, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.847242, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 2.78283429145813 + }, + { + "auxiliary_loss_clip": 0.01508995, + "auxiliary_loss_mlp": 0.01352673, + "balance_loss_clip": 1.15555775, + "balance_loss_mlp": 1.09270096, + "epoch": 0.19269502480084172, + "flos": 17458999259040.0, + "grad_norm": 2.3914238225545086, + "language_loss": 0.75520122, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.78381789, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.7813658714294434 + }, + { + "auxiliary_loss_clip": 0.01511376, + "auxiliary_loss_mlp": 0.01349146, + "balance_loss_clip": 1.15775931, + "balance_loss_mlp": 1.08383369, + "epoch": 0.19275514805350968, + "flos": 21509046551520.0, + "grad_norm": 3.081719714450801, + "language_loss": 0.8327105, + "learning_rate": 3.728502366649107e-06, + "loss": 0.86131573, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 2.7632083892822266 + }, + { + "auxiliary_loss_clip": 0.0165889, + "auxiliary_loss_mlp": 0.01430717, + "balance_loss_clip": 1.30475223, + "balance_loss_mlp": 1.21938324, + "epoch": 0.19281527130617768, + "flos": 47701205507520.0, + "grad_norm": 0.904531497214332, + "language_loss": 0.60595787, + "learning_rate": 3.728306411079786e-06, + "loss": 0.63685393, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 3.114325523376465 + }, + { + "auxiliary_loss_clip": 0.01513932, + "auxiliary_loss_mlp": 0.01342335, + "balance_loss_clip": 1.1594708, + "balance_loss_mlp": 1.07721329, + "epoch": 0.19287539455884564, + "flos": 11802830073120.0, + "grad_norm": 2.777520945097893, + "language_loss": 0.75846058, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.78702325, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.797820806503296 + }, + { + "auxiliary_loss_clip": 0.01507388, + "auxiliary_loss_mlp": 0.01345152, + "balance_loss_clip": 1.15365064, + "balance_loss_mlp": 1.07182932, + "epoch": 0.1929355178115136, + "flos": 20633542090560.0, + "grad_norm": 2.4982944019672018, + "language_loss": 0.61273259, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.641258, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.7953522205352783 + }, + { + "auxiliary_loss_clip": 0.01512455, + "auxiliary_loss_mlp": 0.01334149, + "balance_loss_clip": 1.15784693, + "balance_loss_mlp": 1.06063509, + "epoch": 0.19299564106418157, + "flos": 40811145397920.0, + "grad_norm": 3.652482529458481, + "language_loss": 0.80724943, + "learning_rate": 3.727718151176243e-06, + "loss": 0.83571541, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.9541165828704834 + }, + { + "auxiliary_loss_clip": 0.01514637, + "auxiliary_loss_mlp": 0.01332578, + "balance_loss_clip": 1.16032577, + "balance_loss_mlp": 1.06497645, + "epoch": 0.19305576431684954, + "flos": 11362859045280.0, + "grad_norm": 2.2807326070194724, + "language_loss": 0.82816339, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.85663557, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.784672975540161 + }, + { + "auxiliary_loss_clip": 0.0166128, + "auxiliary_loss_mlp": 0.01346168, + "balance_loss_clip": 1.308851, + "balance_loss_mlp": 1.11499786, + "epoch": 0.1931158875695175, + "flos": 54517115700000.0, + "grad_norm": 1.0634575401130932, + "language_loss": 0.63539171, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.66546619, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 3.2103707790374756 + }, + { + "auxiliary_loss_clip": 0.01517328, + "auxiliary_loss_mlp": 0.01325592, + "balance_loss_clip": 1.16285276, + "balance_loss_mlp": 1.05379486, + "epoch": 0.19317601082218547, + "flos": 19830481143840.0, + "grad_norm": 2.56485048970791, + "language_loss": 0.76514906, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.79357827, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 4.442617416381836 + }, + { + "auxiliary_loss_clip": 0.01513517, + "auxiliary_loss_mlp": 0.01316681, + "balance_loss_clip": 1.16034102, + "balance_loss_mlp": 1.04316711, + "epoch": 0.19323613407485346, + "flos": 13153768755840.0, + "grad_norm": 2.2849088901782686, + "language_loss": 0.71451819, + "learning_rate": 3.726932887459503e-06, + "loss": 0.74282014, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.882908582687378 + }, + { + "auxiliary_loss_clip": 0.01509961, + "auxiliary_loss_mlp": 0.01312044, + "balance_loss_clip": 1.15714085, + "balance_loss_mlp": 1.03833961, + "epoch": 0.19329625732752143, + "flos": 14028969791520.0, + "grad_norm": 2.386582316073769, + "language_loss": 0.75427043, + "learning_rate": 3.72673640779803e-06, + "loss": 0.78249049, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 2.79895281791687 + }, + { + "auxiliary_loss_clip": 0.01515297, + "auxiliary_loss_mlp": 0.01313281, + "balance_loss_clip": 1.16285276, + "balance_loss_mlp": 1.04358137, + "epoch": 0.1933563805801894, + "flos": 23444046799200.0, + "grad_norm": 2.2274926907189663, + "language_loss": 0.88315231, + "learning_rate": 3.72653986265854e-06, + "loss": 0.91143805, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 4.238844633102417 + }, + { + "auxiliary_loss_clip": 0.01514142, + "auxiliary_loss_mlp": 0.01328153, + "balance_loss_clip": 1.16073704, + "balance_loss_mlp": 1.06264997, + "epoch": 0.19341650383285736, + "flos": 20487403432800.0, + "grad_norm": 1.8775779092665177, + "language_loss": 0.80226707, + "learning_rate": 3.726343252048485e-06, + "loss": 0.83069003, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.8145830631256104 + }, + { + "auxiliary_loss_clip": 0.01511409, + "auxiliary_loss_mlp": 0.01358391, + "balance_loss_clip": 1.15787005, + "balance_loss_mlp": 1.08907366, + "epoch": 0.19347662708552532, + "flos": 17860434912000.0, + "grad_norm": 4.8842701697641315, + "language_loss": 0.61868632, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.64738435, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 2.7685320377349854 + }, + { + "auxiliary_loss_clip": 0.01517603, + "auxiliary_loss_mlp": 0.01341735, + "balance_loss_clip": 1.16348076, + "balance_loss_mlp": 1.06936574, + "epoch": 0.1935367503381933, + "flos": 18189237409920.0, + "grad_norm": 2.030057052519616, + "language_loss": 0.80304158, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.83163494, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 2.784902811050415 + }, + { + "auxiliary_loss_clip": 0.01518801, + "auxiliary_loss_mlp": 0.01343478, + "balance_loss_clip": 1.16615355, + "balance_loss_mlp": 1.07377887, + "epoch": 0.19359687359086128, + "flos": 15958735953120.0, + "grad_norm": 2.6126412730037454, + "language_loss": 0.85792142, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.88654423, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 4.336621046066284 + }, + { + "auxiliary_loss_clip": 0.01519844, + "auxiliary_loss_mlp": 0.01367347, + "balance_loss_clip": 1.16687107, + "balance_loss_mlp": 1.10604048, + "epoch": 0.19365699684352924, + "flos": 21217338158400.0, + "grad_norm": 4.611443961165063, + "language_loss": 0.84199959, + "learning_rate": 3.725556155051766e-06, + "loss": 0.87087148, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 4.337270259857178 + }, + { + "auxiliary_loss_clip": 0.01523087, + "auxiliary_loss_mlp": 0.01363278, + "balance_loss_clip": 1.1711216, + "balance_loss_mlp": 1.10063577, + "epoch": 0.1937171200961972, + "flos": 17313012313920.0, + "grad_norm": 2.4359775774526353, + "language_loss": 0.86410201, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.89296556, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 2.7967138290405273 + }, + { + "auxiliary_loss_clip": 0.01514973, + "auxiliary_loss_mlp": 0.0133078, + "balance_loss_clip": 1.16200566, + "balance_loss_mlp": 1.05745733, + "epoch": 0.19377724334886517, + "flos": 22638065384160.0, + "grad_norm": 6.6194633048381295, + "language_loss": 0.78742421, + "learning_rate": 3.72516221392398e-06, + "loss": 0.81588173, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 2.886728286743164 + }, + { + "auxiliary_loss_clip": 0.01526735, + "auxiliary_loss_mlp": 0.01339816, + "balance_loss_clip": 1.17380333, + "balance_loss_mlp": 1.07068896, + "epoch": 0.19383736660153314, + "flos": 15079173179040.0, + "grad_norm": 2.160868826751337, + "language_loss": 0.75482893, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.78349441, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.767648935317993 + }, + { + "auxiliary_loss_clip": 0.015223, + "auxiliary_loss_mlp": 0.01328423, + "balance_loss_clip": 1.16930771, + "balance_loss_mlp": 1.05700755, + "epoch": 0.1938974898542011, + "flos": 47123516237760.0, + "grad_norm": 3.0737698533344058, + "language_loss": 0.70815289, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73666012, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 3.004422187805176 + }, + { + "auxiliary_loss_clip": 0.01523318, + "auxiliary_loss_mlp": 0.01313645, + "balance_loss_clip": 1.17134511, + "balance_loss_mlp": 1.03898668, + "epoch": 0.19395761310686907, + "flos": 25814959761600.0, + "grad_norm": 2.2301055298741006, + "language_loss": 0.69121403, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71958363, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 2.8387582302093506 + }, + { + "auxiliary_loss_clip": 0.01536382, + "auxiliary_loss_mlp": 0.01320823, + "balance_loss_clip": 1.18491197, + "balance_loss_mlp": 1.0495975, + "epoch": 0.19401773635953706, + "flos": 23041852583040.0, + "grad_norm": 1.9071460521527053, + "language_loss": 0.76239502, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.79096711, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 2.864105701446533 + }, + { + "auxiliary_loss_clip": 0.01525946, + "auxiliary_loss_mlp": 0.01321109, + "balance_loss_clip": 1.17415297, + "balance_loss_mlp": 1.05236316, + "epoch": 0.19407785961220503, + "flos": 15922400411520.0, + "grad_norm": 2.391182314454732, + "language_loss": 0.69806087, + "learning_rate": 3.724176216414662e-06, + "loss": 0.72653139, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.9655697345733643 + }, + { + "auxiliary_loss_clip": 0.01530147, + "auxiliary_loss_mlp": 0.01328999, + "balance_loss_clip": 1.17915821, + "balance_loss_mlp": 1.06044459, + "epoch": 0.194137982864873, + "flos": 25924118099040.0, + "grad_norm": 6.734500460989426, + "language_loss": 0.74110192, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76969337, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.9090499877929688 + }, + { + "auxiliary_loss_clip": 0.01526978, + "auxiliary_loss_mlp": 0.01311606, + "balance_loss_clip": 1.17658222, + "balance_loss_mlp": 1.03561282, + "epoch": 0.19419810611754096, + "flos": 13263040877760.0, + "grad_norm": 2.517042967186959, + "language_loss": 0.658086, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.68647182, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.833439588546753 + }, + { + "auxiliary_loss_clip": 0.01524243, + "auxiliary_loss_mlp": 0.01314823, + "balance_loss_clip": 1.1730721, + "balance_loss_mlp": 1.04226267, + "epoch": 0.19425822937020892, + "flos": 15707383486560.0, + "grad_norm": 2.2955144533705956, + "language_loss": 0.81743473, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84582543, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 2.8564658164978027 + }, + { + "auxiliary_loss_clip": 0.01529344, + "auxiliary_loss_mlp": 0.01316318, + "balance_loss_clip": 1.17857742, + "balance_loss_mlp": 1.04204178, + "epoch": 0.1943183526228769, + "flos": 23107241459520.0, + "grad_norm": 4.367257580761418, + "language_loss": 0.87212062, + "learning_rate": 3.72338624150555e-06, + "loss": 0.90057719, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 2.8443591594696045 + }, + { + "auxiliary_loss_clip": 0.0153709, + "auxiliary_loss_mlp": 0.01326461, + "balance_loss_clip": 1.18607998, + "balance_loss_mlp": 1.05180323, + "epoch": 0.19437847587554485, + "flos": 24714652910400.0, + "grad_norm": 2.332369748239194, + "language_loss": 0.85022104, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87885648, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 2.8991880416870117 + }, + { + "auxiliary_loss_clip": 0.01528454, + "auxiliary_loss_mlp": 0.01317015, + "balance_loss_clip": 1.17754936, + "balance_loss_mlp": 1.04102218, + "epoch": 0.19443859912821285, + "flos": 23118581979360.0, + "grad_norm": 1.7742737316550072, + "language_loss": 0.89335704, + "learning_rate": 3.722990861915158e-06, + "loss": 0.92181176, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 2.907130241394043 + }, + { + "auxiliary_loss_clip": 0.01525905, + "auxiliary_loss_mlp": 0.01320116, + "balance_loss_clip": 1.17483497, + "balance_loss_mlp": 1.04240572, + "epoch": 0.1944987223808808, + "flos": 15086114032320.0, + "grad_norm": 2.763296615574664, + "language_loss": 0.78387207, + "learning_rate": 3.722793074112234e-06, + "loss": 0.81233227, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 2.8283159732818604 + }, + { + "auxiliary_loss_clip": 0.01537235, + "auxiliary_loss_mlp": 0.01339859, + "balance_loss_clip": 1.18628693, + "balance_loss_mlp": 1.06863368, + "epoch": 0.19455884563354878, + "flos": 17128679634720.0, + "grad_norm": 2.6425627717337155, + "language_loss": 0.79457939, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.82335031, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 2.793727397918701 + }, + { + "auxiliary_loss_clip": 0.01542488, + "auxiliary_loss_mlp": 0.01316672, + "balance_loss_clip": 1.19159293, + "balance_loss_mlp": 1.03858113, + "epoch": 0.19461896888621674, + "flos": 20195467470720.0, + "grad_norm": 2.5476459750742477, + "language_loss": 0.75719553, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78578711, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 2.870577812194824 + }, + { + "auxiliary_loss_clip": 0.01533719, + "auxiliary_loss_mlp": 0.01332522, + "balance_loss_clip": 1.18180919, + "balance_loss_mlp": 1.05462134, + "epoch": 0.1946790921388847, + "flos": 25303720992480.0, + "grad_norm": 2.6467741576526507, + "language_loss": 0.74929297, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77795541, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.8643226623535156 + }, + { + "auxiliary_loss_clip": 0.01537646, + "auxiliary_loss_mlp": 0.01322173, + "balance_loss_clip": 1.18501973, + "balance_loss_mlp": 1.04427266, + "epoch": 0.19473921539155267, + "flos": 20195467470720.0, + "grad_norm": 2.3891921616425393, + "language_loss": 0.73678905, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.76538724, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 2.8260583877563477 + }, + { + "auxiliary_loss_clip": 0.01534753, + "auxiliary_loss_mlp": 0.01314201, + "balance_loss_clip": 1.18374395, + "balance_loss_mlp": 1.04202235, + "epoch": 0.19479933864422067, + "flos": 20889711433440.0, + "grad_norm": 2.059792785863714, + "language_loss": 0.73683798, + "learning_rate": 3.721803155320412e-06, + "loss": 0.76532751, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 2.9853036403656006 + }, + { + "auxiliary_loss_clip": 0.0154206, + "auxiliary_loss_mlp": 0.01326778, + "balance_loss_clip": 1.19018197, + "balance_loss_mlp": 1.05650711, + "epoch": 0.19485946189688863, + "flos": 23297339219040.0, + "grad_norm": 2.723972169880476, + "language_loss": 0.66563839, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.69432676, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.831313133239746 + }, + { + "auxiliary_loss_clip": 0.01539449, + "auxiliary_loss_mlp": 0.0133999, + "balance_loss_clip": 1.18655396, + "balance_loss_mlp": 1.07238925, + "epoch": 0.1949195851495566, + "flos": 23297301290880.0, + "grad_norm": 1.6836837861205882, + "language_loss": 0.83144748, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.86024189, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 2.8493905067443848 + }, + { + "auxiliary_loss_clip": 0.01660169, + "auxiliary_loss_mlp": 0.01318748, + "balance_loss_clip": 1.31598473, + "balance_loss_mlp": 1.09597015, + "epoch": 0.19497970840222456, + "flos": 64969576012800.0, + "grad_norm": 1.8713846406591044, + "language_loss": 0.57465124, + "learning_rate": 3.721208420493875e-06, + "loss": 0.60444045, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.347036600112915 + }, + { + "auxiliary_loss_clip": 0.01535274, + "auxiliary_loss_mlp": 0.01339242, + "balance_loss_clip": 1.18280435, + "balance_loss_mlp": 1.07164121, + "epoch": 0.19503983165489253, + "flos": 19646679458880.0, + "grad_norm": 3.443542800711624, + "language_loss": 0.83878446, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.86752957, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 2.8020308017730713 + }, + { + "auxiliary_loss_clip": 0.01532571, + "auxiliary_loss_mlp": 0.01337815, + "balance_loss_clip": 1.18026459, + "balance_loss_mlp": 1.0646832, + "epoch": 0.1950999549075605, + "flos": 21144098152800.0, + "grad_norm": 3.2955062171141005, + "language_loss": 0.773278, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.80198187, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.8889260292053223 + }, + { + "auxiliary_loss_clip": 0.01534109, + "auxiliary_loss_mlp": 0.01344777, + "balance_loss_clip": 1.1812247, + "balance_loss_mlp": 1.07774782, + "epoch": 0.19516007816022846, + "flos": 20886828893280.0, + "grad_norm": 2.2750118744880363, + "language_loss": 0.84107298, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86986184, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.815800189971924 + }, + { + "auxiliary_loss_clip": 0.01532134, + "auxiliary_loss_mlp": 0.01335999, + "balance_loss_clip": 1.18023932, + "balance_loss_mlp": 1.06629944, + "epoch": 0.19522020141289645, + "flos": 16912449008640.0, + "grad_norm": 3.08757461388434, + "language_loss": 0.75610298, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78478432, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 2.8105998039245605 + }, + { + "auxiliary_loss_clip": 0.01533885, + "auxiliary_loss_mlp": 0.01330799, + "balance_loss_clip": 1.18239951, + "balance_loss_mlp": 1.06529582, + "epoch": 0.19528032466556441, + "flos": 26727292830240.0, + "grad_norm": 2.4383547465730913, + "language_loss": 0.75662601, + "learning_rate": 3.720215890515421e-06, + "loss": 0.78527296, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.8352584838867188 + }, + { + "auxiliary_loss_clip": 0.01529121, + "auxiliary_loss_mlp": 0.01337448, + "balance_loss_clip": 1.17672396, + "balance_loss_mlp": 1.06908441, + "epoch": 0.19534044791823238, + "flos": 21034939815360.0, + "grad_norm": 2.2667556223936205, + "language_loss": 0.78795135, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.81661707, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.845506429672241 + }, + { + "auxiliary_loss_clip": 0.01533293, + "auxiliary_loss_mlp": 0.0133585, + "balance_loss_clip": 1.17922211, + "balance_loss_mlp": 1.06595969, + "epoch": 0.19540057117090034, + "flos": 22346129422080.0, + "grad_norm": 1.6132086852444834, + "language_loss": 0.73118883, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75988024, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 4.36090874671936 + }, + { + "auxiliary_loss_clip": 0.01537685, + "auxiliary_loss_mlp": 0.01324188, + "balance_loss_clip": 1.18519449, + "balance_loss_mlp": 1.05353475, + "epoch": 0.1954606944235683, + "flos": 20303639676000.0, + "grad_norm": 2.1078790592436434, + "language_loss": 0.79549056, + "learning_rate": 3.719619589699017e-06, + "loss": 0.82410932, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.7392780780792236 + }, + { + "auxiliary_loss_clip": 0.01533337, + "auxiliary_loss_mlp": 0.0132426, + "balance_loss_clip": 1.18030834, + "balance_loss_mlp": 1.04902983, + "epoch": 0.19552081767623627, + "flos": 17348741004960.0, + "grad_norm": 4.003607933676988, + "language_loss": 0.84057558, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86915159, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.785479784011841 + }, + { + "auxiliary_loss_clip": 0.0152362, + "auxiliary_loss_mlp": 0.01330704, + "balance_loss_clip": 1.17037725, + "balance_loss_mlp": 1.05604529, + "epoch": 0.19558094092890424, + "flos": 31980319596000.0, + "grad_norm": 1.7510351596006426, + "language_loss": 0.7375592, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76610243, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.8421823978424072 + }, + { + "auxiliary_loss_clip": 0.01529991, + "auxiliary_loss_mlp": 0.01332996, + "balance_loss_clip": 1.17629397, + "balance_loss_mlp": 1.05986404, + "epoch": 0.19564106418157223, + "flos": 22270841295840.0, + "grad_norm": 2.481623055276144, + "language_loss": 0.76523995, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.79386985, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 4.219735622406006 + }, + { + "auxiliary_loss_clip": 0.01701573, + "auxiliary_loss_mlp": 0.01273186, + "balance_loss_clip": 1.3557632, + "balance_loss_mlp": 1.0328598, + "epoch": 0.1957011874342402, + "flos": 54369687484800.0, + "grad_norm": 0.7637450504740751, + "language_loss": 0.55256808, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.58231568, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.2866251468658447 + }, + { + "auxiliary_loss_clip": 0.01534712, + "auxiliary_loss_mlp": 0.01328734, + "balance_loss_clip": 1.18153417, + "balance_loss_mlp": 1.05064237, + "epoch": 0.19576131068690816, + "flos": 16508472168960.0, + "grad_norm": 3.329763568972229, + "language_loss": 0.7156443, + "learning_rate": 3.718624450942688e-06, + "loss": 0.74427867, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.7489397525787354 + }, + { + "auxiliary_loss_clip": 0.01538094, + "auxiliary_loss_mlp": 0.01323556, + "balance_loss_clip": 1.18503153, + "balance_loss_mlp": 1.05004215, + "epoch": 0.19582143393957613, + "flos": 14721393202560.0, + "grad_norm": 2.505801418464569, + "language_loss": 0.80958128, + "learning_rate": 3.718425227649987e-06, + "loss": 0.83819777, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 2.8224008083343506 + }, + { + "auxiliary_loss_clip": 0.01533799, + "auxiliary_loss_mlp": 0.0132261, + "balance_loss_clip": 1.1809293, + "balance_loss_mlp": 1.04146647, + "epoch": 0.1958815571922441, + "flos": 24427799321760.0, + "grad_norm": 4.1961092977143, + "language_loss": 0.75467038, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.78323448, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 4.313570261001587 + }, + { + "auxiliary_loss_clip": 0.01538226, + "auxiliary_loss_mlp": 0.01322301, + "balance_loss_clip": 1.18590355, + "balance_loss_mlp": 1.03963244, + "epoch": 0.19594168044491206, + "flos": 24902626692960.0, + "grad_norm": 2.0792010450209903, + "language_loss": 0.73745143, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.76605678, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 2.8976056575775146 + }, + { + "auxiliary_loss_clip": 0.01538397, + "auxiliary_loss_mlp": 0.01329992, + "balance_loss_clip": 1.18511772, + "balance_loss_mlp": 1.04922986, + "epoch": 0.19600180369758005, + "flos": 12058620134400.0, + "grad_norm": 2.467073423775058, + "language_loss": 0.77059102, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79927492, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 4.287682771682739 + }, + { + "auxiliary_loss_clip": 0.01528633, + "auxiliary_loss_mlp": 0.01330184, + "balance_loss_clip": 1.17551541, + "balance_loss_mlp": 1.05590701, + "epoch": 0.19606192695024802, + "flos": 20852313903360.0, + "grad_norm": 2.1828988940645537, + "language_loss": 0.82045293, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84904104, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 2.8143365383148193 + }, + { + "auxiliary_loss_clip": 0.0152946, + "auxiliary_loss_mlp": 0.01326457, + "balance_loss_clip": 1.17708838, + "balance_loss_mlp": 1.05580401, + "epoch": 0.19612205020291598, + "flos": 28478036255040.0, + "grad_norm": 2.0901394009463496, + "language_loss": 0.77237487, + "learning_rate": 3.717428133894807e-06, + "loss": 0.80093408, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.814358711242676 + }, + { + "auxiliary_loss_clip": 0.01539882, + "auxiliary_loss_mlp": 0.0132767, + "balance_loss_clip": 1.18517852, + "balance_loss_mlp": 1.054919, + "epoch": 0.19618217345558395, + "flos": 25558980059520.0, + "grad_norm": 1.7721391769067474, + "language_loss": 0.86588025, + "learning_rate": 3.71722851973837e-06, + "loss": 0.89455581, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.784453868865967 + }, + { + "auxiliary_loss_clip": 0.0153529, + "auxiliary_loss_mlp": 0.0132403, + "balance_loss_clip": 1.1817553, + "balance_loss_mlp": 1.04879951, + "epoch": 0.1962422967082519, + "flos": 25266740672160.0, + "grad_norm": 1.8569548513563767, + "language_loss": 0.73821872, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76681191, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 2.7943012714385986 + }, + { + "auxiliary_loss_clip": 0.01539889, + "auxiliary_loss_mlp": 0.01333977, + "balance_loss_clip": 1.18498993, + "balance_loss_mlp": 1.06523132, + "epoch": 0.19630241996091988, + "flos": 18809369019360.0, + "grad_norm": 2.2460471619229323, + "language_loss": 0.78490317, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.81364185, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.76705002784729 + }, + { + "auxiliary_loss_clip": 0.01696412, + "auxiliary_loss_mlp": 0.01266014, + "balance_loss_clip": 1.35033417, + "balance_loss_mlp": 1.02416229, + "epoch": 0.19636254321358784, + "flos": 62326867305600.0, + "grad_norm": 0.7971393158403766, + "language_loss": 0.53385448, + "learning_rate": 3.716629286594483e-06, + "loss": 0.56347871, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.3194196224212646 + }, + { + "auxiliary_loss_clip": 0.01523237, + "auxiliary_loss_mlp": 0.01335589, + "balance_loss_clip": 1.16939688, + "balance_loss_mlp": 1.06188464, + "epoch": 0.19642266646625584, + "flos": 21071464997760.0, + "grad_norm": 2.0227748423746994, + "language_loss": 0.80278146, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.83136976, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 2.8064680099487305 + }, + { + "auxiliary_loss_clip": 0.01528631, + "auxiliary_loss_mlp": 0.01327744, + "balance_loss_clip": 1.175035, + "balance_loss_mlp": 1.05918884, + "epoch": 0.1964827897189238, + "flos": 14540436129600.0, + "grad_norm": 2.181469476896075, + "language_loss": 0.86815798, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89672172, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 2.7912285327911377 + }, + { + "auxiliary_loss_clip": 0.01529363, + "auxiliary_loss_mlp": 0.01318395, + "balance_loss_clip": 1.17563605, + "balance_loss_mlp": 1.04564452, + "epoch": 0.19654291297159177, + "flos": 19246647147840.0, + "grad_norm": 3.3668412291415195, + "language_loss": 0.6931268, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.72160435, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 2.7994813919067383 + }, + { + "auxiliary_loss_clip": 0.01529509, + "auxiliary_loss_mlp": 0.01335296, + "balance_loss_clip": 1.17576134, + "balance_loss_mlp": 1.05987453, + "epoch": 0.19660303622425973, + "flos": 25778093225760.0, + "grad_norm": 12.516613764458931, + "language_loss": 0.80868745, + "learning_rate": 3.715829397778135e-06, + "loss": 0.83733553, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 2.8447165489196777 + }, + { + "auxiliary_loss_clip": 0.01531789, + "auxiliary_loss_mlp": 0.0132979, + "balance_loss_clip": 1.17795944, + "balance_loss_mlp": 1.0621891, + "epoch": 0.1966631594769277, + "flos": 20597206548960.0, + "grad_norm": 2.2907564794388406, + "language_loss": 0.83867872, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86729449, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.722618579864502 + }, + { + "auxiliary_loss_clip": 0.0153152, + "auxiliary_loss_mlp": 0.01322619, + "balance_loss_clip": 1.17710495, + "balance_loss_mlp": 1.05063128, + "epoch": 0.19672328272959566, + "flos": 23625459010080.0, + "grad_norm": 2.5336879994132393, + "language_loss": 0.80441314, + "learning_rate": 3.715429062953087e-06, + "loss": 0.83295453, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 2.8621866703033447 + }, + { + "auxiliary_loss_clip": 0.01527108, + "auxiliary_loss_mlp": 0.01328353, + "balance_loss_clip": 1.17251205, + "balance_loss_mlp": 1.05674624, + "epoch": 0.19678340598226365, + "flos": 23113082396160.0, + "grad_norm": 3.106890306149101, + "language_loss": 0.80430198, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.8328566, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 2.825965404510498 + }, + { + "auxiliary_loss_clip": 0.01531222, + "auxiliary_loss_mlp": 0.01327975, + "balance_loss_clip": 1.17664599, + "balance_loss_mlp": 1.0552243, + "epoch": 0.19684352923493162, + "flos": 24537147300000.0, + "grad_norm": 2.415922797447333, + "language_loss": 0.78008389, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80867589, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.8352954387664795 + }, + { + "auxiliary_loss_clip": 0.01528884, + "auxiliary_loss_mlp": 0.0134017, + "balance_loss_clip": 1.17485738, + "balance_loss_mlp": 1.06608355, + "epoch": 0.19690365248759958, + "flos": 21798479255040.0, + "grad_norm": 9.213503637092844, + "language_loss": 0.81093705, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83962762, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 2.805387020111084 + }, + { + "auxiliary_loss_clip": 0.01525876, + "auxiliary_loss_mlp": 0.01337574, + "balance_loss_clip": 1.17231679, + "balance_loss_mlp": 1.06825686, + "epoch": 0.19696377574026755, + "flos": 19058332011840.0, + "grad_norm": 2.237752767586033, + "language_loss": 0.81339616, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.84203064, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 2.7365779876708984 + }, + { + "auxiliary_loss_clip": 0.01530896, + "auxiliary_loss_mlp": 0.01342194, + "balance_loss_clip": 1.17632723, + "balance_loss_mlp": 1.07344818, + "epoch": 0.19702389899293551, + "flos": 22822891129440.0, + "grad_norm": 2.8527620073024966, + "language_loss": 0.89641607, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.92514694, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 2.8412363529205322 + }, + { + "auxiliary_loss_clip": 0.01522877, + "auxiliary_loss_mlp": 0.01345601, + "balance_loss_clip": 1.16818976, + "balance_loss_mlp": 1.07456708, + "epoch": 0.19708402224560348, + "flos": 22896510416640.0, + "grad_norm": 2.809265130281902, + "language_loss": 0.62804049, + "learning_rate": 3.714226497539239e-06, + "loss": 0.65672529, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.8174426555633545 + }, + { + "auxiliary_loss_clip": 0.01522941, + "auxiliary_loss_mlp": 0.0134693, + "balance_loss_clip": 1.16914058, + "balance_loss_mlp": 1.07780266, + "epoch": 0.19714414549827144, + "flos": 25664686934400.0, + "grad_norm": 2.197686034767404, + "language_loss": 0.73458338, + "learning_rate": 3.714025842413166e-06, + "loss": 0.76328212, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.802558183670044 + }, + { + "auxiliary_loss_clip": 0.01516175, + "auxiliary_loss_mlp": 0.01325773, + "balance_loss_clip": 1.16333008, + "balance_loss_mlp": 1.05225945, + "epoch": 0.19720426875093944, + "flos": 23918153535360.0, + "grad_norm": 1.7362605102498199, + "language_loss": 0.82569176, + "learning_rate": 3.713825122291061e-06, + "loss": 0.85411119, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 2.7983129024505615 + }, + { + "auxiliary_loss_clip": 0.0152234, + "auxiliary_loss_mlp": 0.01336028, + "balance_loss_clip": 1.16908979, + "balance_loss_mlp": 1.0613699, + "epoch": 0.1972643920036074, + "flos": 13883703481440.0, + "grad_norm": 2.0785235438232434, + "language_loss": 0.77972108, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80830479, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 2.7885944843292236 + }, + { + "auxiliary_loss_clip": 0.01524305, + "auxiliary_loss_mlp": 0.01343675, + "balance_loss_clip": 1.16984928, + "balance_loss_mlp": 1.07874417, + "epoch": 0.19732451525627537, + "flos": 19865565056160.0, + "grad_norm": 2.44043432722378, + "language_loss": 0.79534984, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.82402962, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.826876163482666 + }, + { + "auxiliary_loss_clip": 0.01510846, + "auxiliary_loss_mlp": 0.01355622, + "balance_loss_clip": 1.15728724, + "balance_loss_mlp": 1.08477867, + "epoch": 0.19738463850894333, + "flos": 24975790842240.0, + "grad_norm": 2.5294491027187065, + "language_loss": 0.72154325, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.75020796, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 2.7786178588867188 + }, + { + "auxiliary_loss_clip": 0.01520453, + "auxiliary_loss_mlp": 0.01346168, + "balance_loss_clip": 1.1664896, + "balance_loss_mlp": 1.07189143, + "epoch": 0.1974447617616113, + "flos": 18370839261600.0, + "grad_norm": 1.9146578924978057, + "language_loss": 0.78686142, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81552762, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.8637349605560303 + }, + { + "auxiliary_loss_clip": 0.01524266, + "auxiliary_loss_mlp": 0.01343822, + "balance_loss_clip": 1.16946578, + "balance_loss_mlp": 1.07412338, + "epoch": 0.19750488501427926, + "flos": 22895145002880.0, + "grad_norm": 2.5979134568186306, + "language_loss": 0.86519861, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.89387959, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.7822532653808594 + }, + { + "auxiliary_loss_clip": 0.01513079, + "auxiliary_loss_mlp": 0.01324603, + "balance_loss_clip": 1.15915501, + "balance_loss_mlp": 1.05337834, + "epoch": 0.19756500826694723, + "flos": 21873653596800.0, + "grad_norm": 2.4104674666167054, + "language_loss": 0.88288271, + "learning_rate": 3.712619437068174e-06, + "loss": 0.91125959, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.820960283279419 + }, + { + "auxiliary_loss_clip": 0.01523831, + "auxiliary_loss_mlp": 0.01353794, + "balance_loss_clip": 1.16983211, + "balance_loss_mlp": 1.07341361, + "epoch": 0.19762513151961522, + "flos": 15160795308000.0, + "grad_norm": 2.6321980616552354, + "language_loss": 0.77778137, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80655766, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.797884464263916 + }, + { + "auxiliary_loss_clip": 0.01516568, + "auxiliary_loss_mlp": 0.01335008, + "balance_loss_clip": 1.16316867, + "balance_loss_mlp": 1.05329204, + "epoch": 0.1976852547722832, + "flos": 16980872137920.0, + "grad_norm": 2.2566187621938227, + "language_loss": 0.82027376, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.84878957, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 4.281113386154175 + }, + { + "auxiliary_loss_clip": 0.0151903, + "auxiliary_loss_mlp": 0.01349377, + "balance_loss_clip": 1.16464269, + "balance_loss_mlp": 1.08044052, + "epoch": 0.19774537802495115, + "flos": 20305005089760.0, + "grad_norm": 1.8603580275100924, + "language_loss": 0.72840476, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75708884, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.8284873962402344 + }, + { + "auxiliary_loss_clip": 0.01518949, + "auxiliary_loss_mlp": 0.01328341, + "balance_loss_clip": 1.16402888, + "balance_loss_mlp": 1.04986763, + "epoch": 0.19780550127761912, + "flos": 27237962676960.0, + "grad_norm": 2.1166506983400186, + "language_loss": 0.7961157, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.8245886, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.8736541271209717 + }, + { + "auxiliary_loss_clip": 0.01644469, + "auxiliary_loss_mlp": 0.01265953, + "balance_loss_clip": 1.29437208, + "balance_loss_mlp": 1.02867889, + "epoch": 0.19786562453028708, + "flos": 63558217042560.0, + "grad_norm": 0.9185862454579226, + "language_loss": 0.60292602, + "learning_rate": 3.711612913388418e-06, + "loss": 0.63203025, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.4057095050811768 + }, + { + "auxiliary_loss_clip": 0.01515384, + "auxiliary_loss_mlp": 0.01320886, + "balance_loss_clip": 1.16133785, + "balance_loss_mlp": 1.03783572, + "epoch": 0.19792574778295505, + "flos": 26289066497760.0, + "grad_norm": 1.861587284986605, + "language_loss": 0.81271601, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.84107876, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 4.398704290390015 + }, + { + "auxiliary_loss_clip": 0.01517137, + "auxiliary_loss_mlp": 0.01336636, + "balance_loss_clip": 1.16266894, + "balance_loss_mlp": 1.06579208, + "epoch": 0.19798587103562304, + "flos": 19940170475520.0, + "grad_norm": 2.0612538737362605, + "language_loss": 0.81700855, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84554625, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 2.8176848888397217 + }, + { + "auxiliary_loss_clip": 0.01522066, + "auxiliary_loss_mlp": 0.01341505, + "balance_loss_clip": 1.16837072, + "balance_loss_mlp": 1.05711937, + "epoch": 0.198045994288291, + "flos": 20122341249600.0, + "grad_norm": 2.329166259814502, + "language_loss": 0.61086833, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63950408, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.8216116428375244 + }, + { + "auxiliary_loss_clip": 0.01519067, + "auxiliary_loss_mlp": 0.01330225, + "balance_loss_clip": 1.16350591, + "balance_loss_mlp": 1.05804598, + "epoch": 0.19810611754095897, + "flos": 17969593249440.0, + "grad_norm": 2.2790261121564903, + "language_loss": 0.87582296, + "learning_rate": 3.710806526117251e-06, + "loss": 0.90431589, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 2.7380337715148926 + }, + { + "auxiliary_loss_clip": 0.0151764, + "auxiliary_loss_mlp": 0.0133608, + "balance_loss_clip": 1.16236448, + "balance_loss_mlp": 1.06084943, + "epoch": 0.19816624079362694, + "flos": 15086758811040.0, + "grad_norm": 2.493959787811987, + "language_loss": 0.81129891, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.83983612, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 2.708040237426758 + }, + { + "auxiliary_loss_clip": 0.01517891, + "auxiliary_loss_mlp": 0.0131596, + "balance_loss_clip": 1.16219664, + "balance_loss_mlp": 1.03805923, + "epoch": 0.1982263640462949, + "flos": 24902664621120.0, + "grad_norm": 2.2790686232429866, + "language_loss": 0.67841536, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70675385, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 4.320048809051514 + }, + { + "auxiliary_loss_clip": 0.01517567, + "auxiliary_loss_mlp": 0.01320515, + "balance_loss_clip": 1.16254616, + "balance_loss_mlp": 1.0475738, + "epoch": 0.19828648729896287, + "flos": 20378283023520.0, + "grad_norm": 1.8972795905080329, + "language_loss": 0.81006575, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83844662, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 2.7542974948883057 + }, + { + "auxiliary_loss_clip": 0.01512767, + "auxiliary_loss_mlp": 0.01342715, + "balance_loss_clip": 1.15757477, + "balance_loss_mlp": 1.07129967, + "epoch": 0.19834661055163083, + "flos": 18882040102560.0, + "grad_norm": 2.0472400944115585, + "language_loss": 0.85482764, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.88338256, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.7566967010498047 + }, + { + "auxiliary_loss_clip": 0.01628037, + "auxiliary_loss_mlp": 0.01339867, + "balance_loss_clip": 1.27665389, + "balance_loss_mlp": 1.11708832, + "epoch": 0.19840673380429882, + "flos": 60265792396800.0, + "grad_norm": 0.7933922170207212, + "language_loss": 0.53173262, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.56141168, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 3.317028284072876 + }, + { + "auxiliary_loss_clip": 0.01520133, + "auxiliary_loss_mlp": 0.01322705, + "balance_loss_clip": 1.16475177, + "balance_loss_mlp": 1.04671216, + "epoch": 0.1984668570569668, + "flos": 19904100431040.0, + "grad_norm": 1.7116896782537243, + "language_loss": 0.73910654, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.76753491, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 2.7826194763183594 + }, + { + "auxiliary_loss_clip": 0.01510901, + "auxiliary_loss_mlp": 0.01327553, + "balance_loss_clip": 1.15583444, + "balance_loss_mlp": 1.05308592, + "epoch": 0.19852698030963475, + "flos": 15632474641920.0, + "grad_norm": 2.66688946344183, + "language_loss": 0.88213265, + "learning_rate": 3.709392851040235e-06, + "loss": 0.91051722, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.813398838043213 + }, + { + "auxiliary_loss_clip": 0.01506194, + "auxiliary_loss_mlp": 0.01310899, + "balance_loss_clip": 1.15122008, + "balance_loss_mlp": 1.02899289, + "epoch": 0.19858710356230272, + "flos": 43146367597440.0, + "grad_norm": 1.9284844194229103, + "language_loss": 0.73615485, + "learning_rate": 3.709190638115111e-06, + "loss": 0.7643258, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 3.0002644062042236 + }, + { + "auxiliary_loss_clip": 0.01518224, + "auxiliary_loss_mlp": 0.01326621, + "balance_loss_clip": 1.16252804, + "balance_loss_mlp": 1.04891133, + "epoch": 0.19864722681497068, + "flos": 35146518232320.0, + "grad_norm": 2.05801634229883, + "language_loss": 0.74817073, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.77661908, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.9192869663238525 + }, + { + "auxiliary_loss_clip": 0.01513762, + "auxiliary_loss_mlp": 0.01314986, + "balance_loss_clip": 1.15968072, + "balance_loss_mlp": 1.0414722, + "epoch": 0.19870735006763865, + "flos": 19428173143200.0, + "grad_norm": 1.7645132054963293, + "language_loss": 0.86259794, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.89088547, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.7883782386779785 + }, + { + "auxiliary_loss_clip": 0.015116, + "auxiliary_loss_mlp": 0.01326216, + "balance_loss_clip": 1.15801311, + "balance_loss_mlp": 1.04297495, + "epoch": 0.19876747332030664, + "flos": 23549488176960.0, + "grad_norm": 1.765387428491922, + "language_loss": 0.68712711, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.7155053, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 2.8252713680267334 + }, + { + "auxiliary_loss_clip": 0.01506635, + "auxiliary_loss_mlp": 0.01323779, + "balance_loss_clip": 1.15194392, + "balance_loss_mlp": 1.04912078, + "epoch": 0.1988275965729746, + "flos": 19831505204160.0, + "grad_norm": 1.5195742441265514, + "language_loss": 0.76279521, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.79109931, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 2.8403987884521484 + }, + { + "auxiliary_loss_clip": 0.01513165, + "auxiliary_loss_mlp": 0.01310035, + "balance_loss_clip": 1.15853989, + "balance_loss_mlp": 1.0363307, + "epoch": 0.19888771982564257, + "flos": 23515883462880.0, + "grad_norm": 3.319384760429614, + "language_loss": 0.76322329, + "learning_rate": 3.708178601452737e-06, + "loss": 0.79145527, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 2.812652111053467 + }, + { + "auxiliary_loss_clip": 0.01507512, + "auxiliary_loss_mlp": 0.01318658, + "balance_loss_clip": 1.15222299, + "balance_loss_mlp": 1.0466702, + "epoch": 0.19894784307831054, + "flos": 18152484658560.0, + "grad_norm": 1.885046920890153, + "language_loss": 0.76441413, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.79267585, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 2.7957193851470947 + }, + { + "auxiliary_loss_clip": 0.01518007, + "auxiliary_loss_mlp": 0.01320696, + "balance_loss_clip": 1.16302633, + "balance_loss_mlp": 1.04889929, + "epoch": 0.1990079663309785, + "flos": 24278133345120.0, + "grad_norm": 1.6937071328032376, + "language_loss": 0.88096273, + "learning_rate": 3.707773333313917e-06, + "loss": 0.9093498, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 2.7955493927001953 + }, + { + "auxiliary_loss_clip": 0.01510249, + "auxiliary_loss_mlp": 0.01327908, + "balance_loss_clip": 1.15698361, + "balance_loss_mlp": 1.057446, + "epoch": 0.19906808958364647, + "flos": 34900589492640.0, + "grad_norm": 2.494223039651707, + "language_loss": 0.64604551, + "learning_rate": 3.70757060210226e-06, + "loss": 0.67442709, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.8696229457855225 + }, + { + "auxiliary_loss_clip": 0.0151003, + "auxiliary_loss_mlp": 0.01336897, + "balance_loss_clip": 1.15603125, + "balance_loss_mlp": 1.06052208, + "epoch": 0.19912821283631443, + "flos": 24027198088320.0, + "grad_norm": 2.8366905711836834, + "language_loss": 0.74056959, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76903886, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.8216147422790527 + }, + { + "auxiliary_loss_clip": 0.0152015, + "auxiliary_loss_mlp": 0.0134325, + "balance_loss_clip": 1.16658926, + "balance_loss_mlp": 1.07069039, + "epoch": 0.19918833608898243, + "flos": 19860748179840.0, + "grad_norm": 2.569427677459645, + "language_loss": 0.83612287, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.86475682, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.791764497756958 + }, + { + "auxiliary_loss_clip": 0.01516531, + "auxiliary_loss_mlp": 0.01323098, + "balance_loss_clip": 1.16494143, + "balance_loss_mlp": 1.04920316, + "epoch": 0.1992484593416504, + "flos": 29098357505280.0, + "grad_norm": 2.9736148255992725, + "language_loss": 0.81498981, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.84338611, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 2.8335368633270264 + }, + { + "auxiliary_loss_clip": 0.01512998, + "auxiliary_loss_mlp": 0.01325325, + "balance_loss_clip": 1.15891457, + "balance_loss_mlp": 1.05924988, + "epoch": 0.19930858259431836, + "flos": 23297263362720.0, + "grad_norm": 1.6330514119912947, + "language_loss": 0.88100934, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.9093926, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 2.8213186264038086 + }, + { + "auxiliary_loss_clip": 0.0151422, + "auxiliary_loss_mlp": 0.01331182, + "balance_loss_clip": 1.16254401, + "balance_loss_mlp": 1.06186461, + "epoch": 0.19936870584698632, + "flos": 25381436520960.0, + "grad_norm": 2.005785222630406, + "language_loss": 0.70924032, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73769438, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 2.8257102966308594 + }, + { + "auxiliary_loss_clip": 0.01614874, + "auxiliary_loss_mlp": 0.01297996, + "balance_loss_clip": 1.26709318, + "balance_loss_mlp": 1.06529999, + "epoch": 0.1994288290996543, + "flos": 62175418341120.0, + "grad_norm": 0.8585544551686215, + "language_loss": 0.66277945, + "learning_rate": 3.706352855325342e-06, + "loss": 0.69190818, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.4053540229797363 + }, + { + "auxiliary_loss_clip": 0.01506452, + "auxiliary_loss_mlp": 0.01331063, + "balance_loss_clip": 1.15485501, + "balance_loss_mlp": 1.05640531, + "epoch": 0.19948895235232225, + "flos": 19027837406880.0, + "grad_norm": 2.9787712004572873, + "language_loss": 0.75107372, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.77944887, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.7737419605255127 + }, + { + "auxiliary_loss_clip": 0.0151276, + "auxiliary_loss_mlp": 0.01333067, + "balance_loss_clip": 1.1612078, + "balance_loss_mlp": 1.06393969, + "epoch": 0.19954907560499022, + "flos": 37818849196800.0, + "grad_norm": 2.023515591005414, + "language_loss": 0.79122722, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81968558, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.937142848968506 + }, + { + "auxiliary_loss_clip": 0.0151315, + "auxiliary_loss_mlp": 0.01363673, + "balance_loss_clip": 1.16261339, + "balance_loss_mlp": 1.09549952, + "epoch": 0.1996091988576582, + "flos": 49568958763200.0, + "grad_norm": 2.405161074911696, + "language_loss": 0.76147282, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.79024112, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 3.0294647216796875 + }, + { + "auxiliary_loss_clip": 0.01516515, + "auxiliary_loss_mlp": 0.01365642, + "balance_loss_clip": 1.16399693, + "balance_loss_mlp": 1.10357213, + "epoch": 0.19966932211032618, + "flos": 22637875743360.0, + "grad_norm": 1.5664711010635894, + "language_loss": 0.8051061, + "learning_rate": 3.705539729936701e-06, + "loss": 0.83392769, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 2.8230056762695312 + }, + { + "auxiliary_loss_clip": 0.01635469, + "auxiliary_loss_mlp": 0.01360245, + "balance_loss_clip": 1.28842294, + "balance_loss_mlp": 1.14128113, + "epoch": 0.19972944536299414, + "flos": 54087764556960.0, + "grad_norm": 0.9017428510754547, + "language_loss": 0.65152019, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.68147731, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 3.18496036529541 + }, + { + "auxiliary_loss_clip": 0.016323, + "auxiliary_loss_mlp": 0.01322662, + "balance_loss_clip": 1.28626513, + "balance_loss_mlp": 1.09988403, + "epoch": 0.1997895686156621, + "flos": 69360714891360.0, + "grad_norm": 0.9401839934649444, + "language_loss": 0.56855756, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59810716, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.385497570037842 + }, + { + "auxiliary_loss_clip": 0.01521961, + "auxiliary_loss_mlp": 0.01330527, + "balance_loss_clip": 1.17201042, + "balance_loss_mlp": 1.06140065, + "epoch": 0.19984969186833007, + "flos": 18554072024160.0, + "grad_norm": 2.1694603746977568, + "language_loss": 0.80522215, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.83374703, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.8135905265808105 + }, + { + "auxiliary_loss_clip": 0.01517737, + "auxiliary_loss_mlp": 0.01327672, + "balance_loss_clip": 1.16827226, + "balance_loss_mlp": 1.05740094, + "epoch": 0.19990981512099804, + "flos": 26431943333760.0, + "grad_norm": 1.6989159633848523, + "language_loss": 0.53969741, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56815159, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.857383966445923 + }, + { + "auxiliary_loss_clip": 0.01520025, + "auxiliary_loss_mlp": 0.01325608, + "balance_loss_clip": 1.17248166, + "balance_loss_mlp": 1.05018663, + "epoch": 0.19996993837366603, + "flos": 16327856449440.0, + "grad_norm": 1.9342889429683645, + "language_loss": 0.85908616, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88754249, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 4.309245586395264 + }, + { + "auxiliary_loss_clip": 0.01519697, + "auxiliary_loss_mlp": 0.01331464, + "balance_loss_clip": 1.17054796, + "balance_loss_mlp": 1.06062055, + "epoch": 0.200030061626334, + "flos": 20845941972480.0, + "grad_norm": 2.429315255074445, + "language_loss": 0.72205418, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.75056583, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.816744089126587 + }, + { + "auxiliary_loss_clip": 0.01522801, + "auxiliary_loss_mlp": 0.01327655, + "balance_loss_clip": 1.17448175, + "balance_loss_mlp": 1.04841888, + "epoch": 0.20009018487900196, + "flos": 23764543030080.0, + "grad_norm": 1.8549900706906601, + "language_loss": 0.76727355, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.79577804, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.827028512954712 + }, + { + "auxiliary_loss_clip": 0.01524639, + "auxiliary_loss_mlp": 0.01307254, + "balance_loss_clip": 1.17743099, + "balance_loss_mlp": 1.03679168, + "epoch": 0.20015030813166992, + "flos": 28114111916640.0, + "grad_norm": 1.9811662552532592, + "language_loss": 0.69612783, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.72444677, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 4.461287260055542 + }, + { + "auxiliary_loss_clip": 0.01521206, + "auxiliary_loss_mlp": 0.01322248, + "balance_loss_clip": 1.17277765, + "balance_loss_mlp": 1.03709984, + "epoch": 0.2002104313843379, + "flos": 26069536121760.0, + "grad_norm": 2.047745509568482, + "language_loss": 0.81809521, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.84652972, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.880201578140259 + }, + { + "auxiliary_loss_clip": 0.01521207, + "auxiliary_loss_mlp": 0.01331441, + "balance_loss_clip": 1.17452741, + "balance_loss_mlp": 1.05010724, + "epoch": 0.20027055463700585, + "flos": 22968878074560.0, + "grad_norm": 4.10875993685716, + "language_loss": 0.7678653, + "learning_rate": 3.703502390349417e-06, + "loss": 0.79639184, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.774001121520996 + }, + { + "auxiliary_loss_clip": 0.01520588, + "auxiliary_loss_mlp": 0.01312808, + "balance_loss_clip": 1.17504501, + "balance_loss_mlp": 1.02861333, + "epoch": 0.20033067788967382, + "flos": 17167594291200.0, + "grad_norm": 2.165214208020569, + "language_loss": 0.79520524, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.82353914, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 2.8816020488739014 + }, + { + "auxiliary_loss_clip": 0.01705978, + "auxiliary_loss_mlp": 0.01363831, + "balance_loss_clip": 1.36997342, + "balance_loss_mlp": 1.12197876, + "epoch": 0.2003908011423418, + "flos": 60831231235200.0, + "grad_norm": 0.9857356840676406, + "language_loss": 0.61921036, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64990842, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 3.205507278442383 + }, + { + "auxiliary_loss_clip": 0.01516899, + "auxiliary_loss_mlp": 0.01333508, + "balance_loss_clip": 1.1721189, + "balance_loss_mlp": 1.0580864, + "epoch": 0.20045092439500978, + "flos": 24208496514720.0, + "grad_norm": 2.473549583620099, + "language_loss": 0.8122319, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.84073597, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 4.439788103103638 + }, + { + "auxiliary_loss_clip": 0.01519118, + "auxiliary_loss_mlp": 0.01324301, + "balance_loss_clip": 1.17214525, + "balance_loss_mlp": 1.05479324, + "epoch": 0.20051104764767774, + "flos": 29390179682880.0, + "grad_norm": 2.6502977207661544, + "language_loss": 0.74971986, + "learning_rate": 3.702685645366134e-06, + "loss": 0.77815402, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 4.492689371109009 + }, + { + "auxiliary_loss_clip": 0.01523146, + "auxiliary_loss_mlp": 0.01328589, + "balance_loss_clip": 1.17666912, + "balance_loss_mlp": 1.053931, + "epoch": 0.2005711709003457, + "flos": 23516073103680.0, + "grad_norm": 2.0608459379475663, + "language_loss": 0.80108118, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82959855, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.878509759902954 + }, + { + "auxiliary_loss_clip": 0.01519459, + "auxiliary_loss_mlp": 0.01327095, + "balance_loss_clip": 1.17379415, + "balance_loss_mlp": 1.0535816, + "epoch": 0.20063129415301367, + "flos": 22525303871520.0, + "grad_norm": 2.3302506833052408, + "language_loss": 0.78019953, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.80866504, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 2.8480067253112793 + }, + { + "auxiliary_loss_clip": 0.01521843, + "auxiliary_loss_mlp": 0.01321813, + "balance_loss_clip": 1.17669272, + "balance_loss_mlp": 1.04810834, + "epoch": 0.20069141740568164, + "flos": 25960643281440.0, + "grad_norm": 2.4077320465191865, + "language_loss": 0.69311315, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.72154975, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 2.9369075298309326 + }, + { + "auxiliary_loss_clip": 0.01530683, + "auxiliary_loss_mlp": 0.01327434, + "balance_loss_clip": 1.18463516, + "balance_loss_mlp": 1.05544627, + "epoch": 0.2007515406583496, + "flos": 24792937361280.0, + "grad_norm": 2.9527322375586205, + "language_loss": 0.69114667, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71972787, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 2.818878412246704 + }, + { + "auxiliary_loss_clip": 0.01530286, + "auxiliary_loss_mlp": 0.013303, + "balance_loss_clip": 1.18418741, + "balance_loss_mlp": 1.05640483, + "epoch": 0.2008116639110176, + "flos": 37928652312960.0, + "grad_norm": 2.2468041067083946, + "language_loss": 0.6653018, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.69390762, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 2.9029810428619385 + }, + { + "auxiliary_loss_clip": 0.01520415, + "auxiliary_loss_mlp": 0.01346019, + "balance_loss_clip": 1.17520106, + "balance_loss_mlp": 1.07651103, + "epoch": 0.20087178716368556, + "flos": 20742624571680.0, + "grad_norm": 3.403484198233785, + "language_loss": 0.74440026, + "learning_rate": 3.701458591066019e-06, + "loss": 0.77306461, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 2.891068696975708 + }, + { + "auxiliary_loss_clip": 0.015258, + "auxiliary_loss_mlp": 0.01338631, + "balance_loss_clip": 1.18008435, + "balance_loss_mlp": 1.07045794, + "epoch": 0.20093191041635353, + "flos": 23844572176320.0, + "grad_norm": 2.3005654975257053, + "language_loss": 0.72294241, + "learning_rate": 3.70125385615256e-06, + "loss": 0.75158674, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.8577911853790283 + }, + { + "auxiliary_loss_clip": 0.01519348, + "auxiliary_loss_mlp": 0.01331666, + "balance_loss_clip": 1.17357802, + "balance_loss_mlp": 1.06101346, + "epoch": 0.2009920336690215, + "flos": 21793852019520.0, + "grad_norm": 2.548971364834111, + "language_loss": 0.72902125, + "learning_rate": 3.701049056727384e-06, + "loss": 0.7575314, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 2.8439908027648926 + }, + { + "auxiliary_loss_clip": 0.01521255, + "auxiliary_loss_mlp": 0.01341391, + "balance_loss_clip": 1.17491722, + "balance_loss_mlp": 1.06902206, + "epoch": 0.20105215692168946, + "flos": 26361623796480.0, + "grad_norm": 2.4627141958669063, + "language_loss": 0.81246674, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.84109318, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 2.91621994972229 + }, + { + "auxiliary_loss_clip": 0.01521518, + "auxiliary_loss_mlp": 0.01341256, + "balance_loss_clip": 1.17558825, + "balance_loss_mlp": 1.07079434, + "epoch": 0.20111228017435742, + "flos": 18809293163040.0, + "grad_norm": 2.395869540817821, + "language_loss": 0.8399505, + "learning_rate": 3.700639264372948e-06, + "loss": 0.8685782, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 2.839421510696411 + }, + { + "auxiliary_loss_clip": 0.01527386, + "auxiliary_loss_mlp": 0.01339604, + "balance_loss_clip": 1.1822834, + "balance_loss_mlp": 1.07658076, + "epoch": 0.20117240342702541, + "flos": 19977150795840.0, + "grad_norm": 1.7992092846389702, + "language_loss": 0.68439621, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.7130661, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.8692824840545654 + }, + { + "auxiliary_loss_clip": 0.01517409, + "auxiliary_loss_mlp": 0.01345538, + "balance_loss_clip": 1.1700418, + "balance_loss_mlp": 1.07869947, + "epoch": 0.20123252667969338, + "flos": 23144601061440.0, + "grad_norm": 2.3591706966672388, + "language_loss": 0.74120784, + "learning_rate": 3.70022921406487e-06, + "loss": 0.76983732, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 2.870859384536743 + }, + { + "auxiliary_loss_clip": 0.0152764, + "auxiliary_loss_mlp": 0.01353212, + "balance_loss_clip": 1.18187475, + "balance_loss_mlp": 1.09228635, + "epoch": 0.20129264993236134, + "flos": 23223909572640.0, + "grad_norm": 2.153860738372545, + "language_loss": 0.86916208, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89797062, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 2.8634986877441406 + }, + { + "auxiliary_loss_clip": 0.01523758, + "auxiliary_loss_mlp": 0.01338718, + "balance_loss_clip": 1.17723751, + "balance_loss_mlp": 1.07645762, + "epoch": 0.2013527731850293, + "flos": 21873615668640.0, + "grad_norm": 3.276770952172722, + "language_loss": 0.70949155, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73811632, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 2.856656312942505 + }, + { + "auxiliary_loss_clip": 0.01524645, + "auxiliary_loss_mlp": 0.01340917, + "balance_loss_clip": 1.17755389, + "balance_loss_mlp": 1.07732129, + "epoch": 0.20141289643769728, + "flos": 18042757398720.0, + "grad_norm": 1.6822319258254483, + "language_loss": 0.71458924, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.74324489, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.8924143314361572 + }, + { + "auxiliary_loss_clip": 0.01523082, + "auxiliary_loss_mlp": 0.01354712, + "balance_loss_clip": 1.17657852, + "balance_loss_mlp": 1.0874927, + "epoch": 0.20147301969036524, + "flos": 23953692585600.0, + "grad_norm": 4.2468416259093695, + "language_loss": 0.7648108, + "learning_rate": 3.69940833983661e-06, + "loss": 0.79358876, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.882964849472046 + }, + { + "auxiliary_loss_clip": 0.01518111, + "auxiliary_loss_mlp": 0.01339161, + "balance_loss_clip": 1.17116213, + "balance_loss_mlp": 1.06850815, + "epoch": 0.2015331429430332, + "flos": 25590574581120.0, + "grad_norm": 1.8279220506270077, + "language_loss": 0.81215239, + "learning_rate": 3.699202960155748e-06, + "loss": 0.84072506, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.9293549060821533 + }, + { + "auxiliary_loss_clip": 0.01525975, + "auxiliary_loss_mlp": 0.01331362, + "balance_loss_clip": 1.17937267, + "balance_loss_mlp": 1.06433344, + "epoch": 0.2015932661957012, + "flos": 26727444542880.0, + "grad_norm": 2.3372389999546646, + "language_loss": 0.80371904, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.83229244, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 2.960408926010132 + }, + { + "auxiliary_loss_clip": 0.01525924, + "auxiliary_loss_mlp": 0.01319697, + "balance_loss_clip": 1.18039906, + "balance_loss_mlp": 1.04675555, + "epoch": 0.20165338944836916, + "flos": 15634977900480.0, + "grad_norm": 1.9503113415175486, + "language_loss": 0.90316325, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.93161952, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 2.8272740840911865 + }, + { + "auxiliary_loss_clip": 0.01708392, + "auxiliary_loss_mlp": 0.01340416, + "balance_loss_clip": 1.36379099, + "balance_loss_mlp": 1.11306, + "epoch": 0.20171351270103713, + "flos": 57918357329760.0, + "grad_norm": 0.860260164072623, + "language_loss": 0.55810481, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.58859289, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.3684487342834473 + }, + { + "auxiliary_loss_clip": 0.0152929, + "auxiliary_loss_mlp": 0.01328679, + "balance_loss_clip": 1.18303967, + "balance_loss_mlp": 1.05802655, + "epoch": 0.2017736359537051, + "flos": 20816509356000.0, + "grad_norm": 1.8331615785283057, + "language_loss": 0.84382701, + "learning_rate": 3.698380797170751e-06, + "loss": 0.87240672, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.839820623397827 + }, + { + "auxiliary_loss_clip": 0.01522732, + "auxiliary_loss_mlp": 0.01345005, + "balance_loss_clip": 1.17675447, + "balance_loss_mlp": 1.0693928, + "epoch": 0.20183375920637306, + "flos": 17093937075840.0, + "grad_norm": 2.4886618959198863, + "language_loss": 0.69516844, + "learning_rate": 3.698175095398085e-06, + "loss": 0.72384578, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0152405, + "auxiliary_loss_mlp": 0.01324036, + "balance_loss_clip": 1.17946756, + "balance_loss_mlp": 1.04003143, + "epoch": 0.20189388245904102, + "flos": 18663344146080.0, + "grad_norm": 2.977145742361284, + "language_loss": 0.72172284, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.75020373, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.8348515033721924 + }, + { + "auxiliary_loss_clip": 0.01529225, + "auxiliary_loss_mlp": 0.01316356, + "balance_loss_clip": 1.18580723, + "balance_loss_mlp": 1.03788304, + "epoch": 0.20195400571170902, + "flos": 16799270286240.0, + "grad_norm": 1.960802925271763, + "language_loss": 0.8323862, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.86084199, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 2.841698169708252 + }, + { + "auxiliary_loss_clip": 0.01731843, + "auxiliary_loss_mlp": 0.01257675, + "balance_loss_clip": 1.40005636, + "balance_loss_mlp": 1.0242157, + "epoch": 0.20201412896437698, + "flos": 67180772036160.0, + "grad_norm": 0.8185558001500766, + "language_loss": 0.58910561, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61900079, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 3.36193585395813 + }, + { + "auxiliary_loss_clip": 0.01532704, + "auxiliary_loss_mlp": 0.01343115, + "balance_loss_clip": 1.18882143, + "balance_loss_mlp": 1.05720329, + "epoch": 0.20207425221704495, + "flos": 21327179202720.0, + "grad_norm": 2.84704261170881, + "language_loss": 0.6273185, + "learning_rate": 3.697351644435763e-06, + "loss": 0.65607667, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 2.8484487533569336 + }, + { + "auxiliary_loss_clip": 0.01530357, + "auxiliary_loss_mlp": 0.01339892, + "balance_loss_clip": 1.18632388, + "balance_loss_mlp": 1.05817676, + "epoch": 0.2021343754697129, + "flos": 22529286328320.0, + "grad_norm": 2.306955943974163, + "language_loss": 0.75664806, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.78535056, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.900233268737793 + }, + { + "auxiliary_loss_clip": 0.0154066, + "auxiliary_loss_mlp": 0.01330996, + "balance_loss_clip": 1.19621181, + "balance_loss_mlp": 1.05576539, + "epoch": 0.20219449872238088, + "flos": 19064817727200.0, + "grad_norm": 1.7641884882800518, + "language_loss": 0.76628697, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.79500353, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.811291217803955 + }, + { + "auxiliary_loss_clip": 0.01527776, + "auxiliary_loss_mlp": 0.01322746, + "balance_loss_clip": 1.18447232, + "balance_loss_mlp": 1.04217494, + "epoch": 0.20225462197504884, + "flos": 24719318074080.0, + "grad_norm": 1.6082818347372985, + "language_loss": 0.7540549, + "learning_rate": 3.696733380367391e-06, + "loss": 0.78256011, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 5.642750024795532 + }, + { + "auxiliary_loss_clip": 0.01536013, + "auxiliary_loss_mlp": 0.01326745, + "balance_loss_clip": 1.19233847, + "balance_loss_mlp": 1.05170524, + "epoch": 0.2023147452277168, + "flos": 22020512889600.0, + "grad_norm": 2.746309385252025, + "language_loss": 0.72055554, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.74918318, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.8380250930786133 + }, + { + "auxiliary_loss_clip": 0.01536295, + "auxiliary_loss_mlp": 0.01327762, + "balance_loss_clip": 1.19316983, + "balance_loss_mlp": 1.04585612, + "epoch": 0.2023748684803848, + "flos": 17747104476960.0, + "grad_norm": 2.3693949497914297, + "language_loss": 0.86242509, + "learning_rate": 3.696320882607286e-06, + "loss": 0.89106566, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.846514940261841 + }, + { + "auxiliary_loss_clip": 0.01545806, + "auxiliary_loss_mlp": 0.01338249, + "balance_loss_clip": 1.20278704, + "balance_loss_mlp": 1.06797779, + "epoch": 0.20243499173305277, + "flos": 31141605814560.0, + "grad_norm": 1.890606101540418, + "language_loss": 0.69645596, + "learning_rate": 3.696114537236335e-06, + "loss": 0.7252965, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 4.418527841567993 + }, + { + "auxiliary_loss_clip": 0.01533541, + "auxiliary_loss_mlp": 0.01334233, + "balance_loss_clip": 1.1901772, + "balance_loss_mlp": 1.05843091, + "epoch": 0.20249511498572073, + "flos": 33842079838080.0, + "grad_norm": 1.8537937915183178, + "language_loss": 0.68153763, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.71021545, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.9847042560577393 + }, + { + "auxiliary_loss_clip": 0.01560556, + "auxiliary_loss_mlp": 0.01336197, + "balance_loss_clip": 1.216483, + "balance_loss_mlp": 1.06821489, + "epoch": 0.2025552382383887, + "flos": 21217793296320.0, + "grad_norm": 2.054886598003508, + "language_loss": 0.77592409, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.80489159, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.8655171394348145 + }, + { + "auxiliary_loss_clip": 0.01546549, + "auxiliary_loss_mlp": 0.0132186, + "balance_loss_clip": 1.20374036, + "balance_loss_mlp": 1.04682052, + "epoch": 0.20261536149105666, + "flos": 14649139329120.0, + "grad_norm": 3.1150978598139445, + "language_loss": 0.64715147, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67583549, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.934756278991699 + }, + { + "auxiliary_loss_clip": 0.01756759, + "auxiliary_loss_mlp": 0.01303734, + "balance_loss_clip": 1.42933083, + "balance_loss_mlp": 1.07408905, + "epoch": 0.20267548474372463, + "flos": 66790259693280.0, + "grad_norm": 0.695217437728137, + "language_loss": 0.5809409, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.61154586, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 3.4376845359802246 + }, + { + "auxiliary_loss_clip": 0.01549048, + "auxiliary_loss_mlp": 0.01323632, + "balance_loss_clip": 1.20626915, + "balance_loss_mlp": 1.0497365, + "epoch": 0.2027356079963926, + "flos": 24683096316960.0, + "grad_norm": 2.0507053236586095, + "language_loss": 0.91922033, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94794714, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 4.374111890792847 + }, + { + "auxiliary_loss_clip": 0.01544785, + "auxiliary_loss_mlp": 0.01329806, + "balance_loss_clip": 1.20042062, + "balance_loss_mlp": 1.05839002, + "epoch": 0.20279573124906058, + "flos": 26395000941600.0, + "grad_norm": 1.643569792929997, + "language_loss": 0.78690386, + "learning_rate": 3.694875114631167e-06, + "loss": 0.81564975, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 2.9624476432800293 + }, + { + "auxiliary_loss_clip": 0.01554452, + "auxiliary_loss_mlp": 0.01314754, + "balance_loss_clip": 1.21097779, + "balance_loss_mlp": 1.04066777, + "epoch": 0.20285585450172855, + "flos": 33802672115520.0, + "grad_norm": 2.1038739546580647, + "language_loss": 0.71538836, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.74408042, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 4.4447948932647705 + }, + { + "auxiliary_loss_clip": 0.01733078, + "auxiliary_loss_mlp": 0.0126223, + "balance_loss_clip": 1.40343189, + "balance_loss_mlp": 1.02648163, + "epoch": 0.20291597775439651, + "flos": 71171840311200.0, + "grad_norm": 0.9736732308703813, + "language_loss": 0.62450576, + "learning_rate": 3.694461459520516e-06, + "loss": 0.65445888, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.285959005355835 + }, + { + "auxiliary_loss_clip": 0.01545538, + "auxiliary_loss_mlp": 0.01341504, + "balance_loss_clip": 1.20053732, + "balance_loss_mlp": 1.07142305, + "epoch": 0.20297610100706448, + "flos": 19495989421920.0, + "grad_norm": 1.7166718074584388, + "language_loss": 0.82614291, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.85501325, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 2.9349262714385986 + }, + { + "auxiliary_loss_clip": 0.01541532, + "auxiliary_loss_mlp": 0.01339025, + "balance_loss_clip": 1.19718456, + "balance_loss_mlp": 1.06398511, + "epoch": 0.20303622425973245, + "flos": 25046489661120.0, + "grad_norm": 2.687058130877717, + "language_loss": 0.81569564, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.84450114, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.898834228515625 + }, + { + "auxiliary_loss_clip": 0.01543447, + "auxiliary_loss_mlp": 0.01345932, + "balance_loss_clip": 1.19996858, + "balance_loss_mlp": 1.07585192, + "epoch": 0.2030963475124004, + "flos": 21982091299200.0, + "grad_norm": 2.4335603695867802, + "language_loss": 0.7680102, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79690397, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 2.9314167499542236 + }, + { + "auxiliary_loss_clip": 0.01546491, + "auxiliary_loss_mlp": 0.01327792, + "balance_loss_clip": 1.20075369, + "balance_loss_mlp": 1.05561376, + "epoch": 0.2031564707650684, + "flos": 19502437209120.0, + "grad_norm": 2.322760597061458, + "language_loss": 0.79809213, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.82683492, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 2.9238760471343994 + }, + { + "auxiliary_loss_clip": 0.01535157, + "auxiliary_loss_mlp": 0.0133454, + "balance_loss_clip": 1.1898632, + "balance_loss_mlp": 1.06541288, + "epoch": 0.20321659401773637, + "flos": 22749271842240.0, + "grad_norm": 1.9950757665375267, + "language_loss": 0.86826044, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.8969574, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 2.8674473762512207 + }, + { + "auxiliary_loss_clip": 0.01538362, + "auxiliary_loss_mlp": 0.01346307, + "balance_loss_clip": 1.19086075, + "balance_loss_mlp": 1.08385623, + "epoch": 0.20327671727040433, + "flos": 22458056515200.0, + "grad_norm": 1.9175606016843636, + "language_loss": 0.75158954, + "learning_rate": 3.693218952340186e-06, + "loss": 0.78043616, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.929108142852783 + }, + { + "auxiliary_loss_clip": 0.01534337, + "auxiliary_loss_mlp": 0.0133628, + "balance_loss_clip": 1.18789589, + "balance_loss_mlp": 1.0625751, + "epoch": 0.2033368405230723, + "flos": 19536952199040.0, + "grad_norm": 3.1133205493638663, + "language_loss": 0.7912429, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81994909, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 2.867310047149658 + }, + { + "auxiliary_loss_clip": 0.01540192, + "auxiliary_loss_mlp": 0.01346704, + "balance_loss_clip": 1.19366074, + "balance_loss_mlp": 1.07319069, + "epoch": 0.20339696377574026, + "flos": 13810994470080.0, + "grad_norm": 1.8874631379823126, + "language_loss": 0.80467868, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.83354765, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 2.809664011001587 + }, + { + "auxiliary_loss_clip": 0.0154412, + "auxiliary_loss_mlp": 0.01342843, + "balance_loss_clip": 1.19481194, + "balance_loss_mlp": 1.07123649, + "epoch": 0.20345708702840823, + "flos": 20341492344000.0, + "grad_norm": 2.3787162751954876, + "language_loss": 0.74752474, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.77639437, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 2.8242623805999756 + }, + { + "auxiliary_loss_clip": 0.01535691, + "auxiliary_loss_mlp": 0.01349722, + "balance_loss_clip": 1.18782091, + "balance_loss_mlp": 1.07430077, + "epoch": 0.2035172102810762, + "flos": 20335727263680.0, + "grad_norm": 2.513359189016813, + "language_loss": 0.7710827, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.79993677, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 2.8802075386047363 + }, + { + "auxiliary_loss_clip": 0.01542451, + "auxiliary_loss_mlp": 0.01328784, + "balance_loss_clip": 1.19346988, + "balance_loss_mlp": 1.0550797, + "epoch": 0.2035773335337442, + "flos": 23333219622720.0, + "grad_norm": 2.6607610247399713, + "language_loss": 0.68575239, + "learning_rate": 3.692181763924639e-06, + "loss": 0.71446466, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.8658881187438965 + }, + { + "auxiliary_loss_clip": 0.01538228, + "auxiliary_loss_mlp": 0.01325465, + "balance_loss_clip": 1.18860602, + "balance_loss_mlp": 1.0494715, + "epoch": 0.20363745678641215, + "flos": 28332845801280.0, + "grad_norm": 1.7518939215034992, + "language_loss": 0.8106631, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83929998, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.961358070373535 + }, + { + "auxiliary_loss_clip": 0.01548888, + "auxiliary_loss_mlp": 0.01330157, + "balance_loss_clip": 1.19837713, + "balance_loss_mlp": 1.06465459, + "epoch": 0.20369758003908012, + "flos": 18917617080960.0, + "grad_norm": 2.152305855681892, + "language_loss": 0.80291289, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.83170331, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 2.8957319259643555 + }, + { + "auxiliary_loss_clip": 0.01535139, + "auxiliary_loss_mlp": 0.01329035, + "balance_loss_clip": 1.18656707, + "balance_loss_mlp": 1.0559026, + "epoch": 0.20375770329174808, + "flos": 19208415198240.0, + "grad_norm": 1.8371583030122622, + "language_loss": 0.72254938, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.75119114, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 2.8152873516082764 + }, + { + "auxiliary_loss_clip": 0.01541859, + "auxiliary_loss_mlp": 0.0132251, + "balance_loss_clip": 1.19228101, + "balance_loss_mlp": 1.0476613, + "epoch": 0.20381782654441605, + "flos": 19393051302720.0, + "grad_norm": 1.7831378560750126, + "language_loss": 0.87240124, + "learning_rate": 3.691350858126404e-06, + "loss": 0.90104496, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.919914960861206 + }, + { + "auxiliary_loss_clip": 0.01540163, + "auxiliary_loss_mlp": 0.01324465, + "balance_loss_clip": 1.19104767, + "balance_loss_mlp": 1.04370379, + "epoch": 0.203877949797084, + "flos": 24830069394240.0, + "grad_norm": 2.2859917256002347, + "language_loss": 0.71325773, + "learning_rate": 3.691142971316662e-06, + "loss": 0.74190402, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 2.8509409427642822 + }, + { + "auxiliary_loss_clip": 0.01534311, + "auxiliary_loss_mlp": 0.01322312, + "balance_loss_clip": 1.18525696, + "balance_loss_mlp": 1.0467, + "epoch": 0.20393807304975198, + "flos": 18005625365760.0, + "grad_norm": 2.9073083274606546, + "language_loss": 0.86501831, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.89358449, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 2.8149220943450928 + }, + { + "auxiliary_loss_clip": 0.01529849, + "auxiliary_loss_mlp": 0.01318112, + "balance_loss_clip": 1.18132854, + "balance_loss_mlp": 1.04097414, + "epoch": 0.20399819630241997, + "flos": 24209293006080.0, + "grad_norm": 1.5340776260053395, + "language_loss": 0.8091507, + "learning_rate": 3.69072700532013e-06, + "loss": 0.83763033, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 2.8711161613464355 + }, + { + "auxiliary_loss_clip": 0.01540368, + "auxiliary_loss_mlp": 0.01324575, + "balance_loss_clip": 1.19108307, + "balance_loss_mlp": 1.04266894, + "epoch": 0.20405831955508794, + "flos": 20779225610400.0, + "grad_norm": 1.7725950480006867, + "language_loss": 0.86346221, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.89211166, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 2.8497719764709473 + }, + { + "auxiliary_loss_clip": 0.01538709, + "auxiliary_loss_mlp": 0.01319475, + "balance_loss_clip": 1.19094419, + "balance_loss_mlp": 1.03642464, + "epoch": 0.2041184428077559, + "flos": 15488725458240.0, + "grad_norm": 2.728067715645233, + "language_loss": 0.84088159, + "learning_rate": 3.69031078287345e-06, + "loss": 0.86946344, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 2.9025774002075195 + }, + { + "auxiliary_loss_clip": 0.01545995, + "auxiliary_loss_mlp": 0.01330067, + "balance_loss_clip": 1.19740796, + "balance_loss_mlp": 1.04797029, + "epoch": 0.20417856606042387, + "flos": 15589843025760.0, + "grad_norm": 2.3074143056581864, + "language_loss": 0.83840632, + "learning_rate": 3.690102575501033e-06, + "loss": 0.86716694, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 2.8155531883239746 + }, + { + "auxiliary_loss_clip": 0.01545227, + "auxiliary_loss_mlp": 0.01318472, + "balance_loss_clip": 1.19515777, + "balance_loss_mlp": 1.04553032, + "epoch": 0.20423868931309183, + "flos": 24281622735840.0, + "grad_norm": 2.669446747680031, + "language_loss": 0.76839978, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79703677, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.8910398483276367 + }, + { + "auxiliary_loss_clip": 0.01551476, + "auxiliary_loss_mlp": 0.01322364, + "balance_loss_clip": 1.20163, + "balance_loss_mlp": 1.04999471, + "epoch": 0.2042988125657598, + "flos": 18616312863360.0, + "grad_norm": 2.686819021926571, + "language_loss": 0.87337446, + "learning_rate": 3.689685968497518e-06, + "loss": 0.90211284, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 2.7956607341766357 + }, + { + "auxiliary_loss_clip": 0.01552933, + "auxiliary_loss_mlp": 0.01350654, + "balance_loss_clip": 1.20264673, + "balance_loss_mlp": 1.07866633, + "epoch": 0.2043589358184278, + "flos": 17852773423680.0, + "grad_norm": 2.068373425882441, + "language_loss": 0.78379422, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.81283009, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 2.816413402557373 + }, + { + "auxiliary_loss_clip": 0.01536928, + "auxiliary_loss_mlp": 0.01320076, + "balance_loss_clip": 1.18708515, + "balance_loss_mlp": 1.04618037, + "epoch": 0.20441905907109575, + "flos": 21437437456800.0, + "grad_norm": 2.741197131141001, + "language_loss": 0.76958078, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.79815078, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 2.836477756500244 + }, + { + "auxiliary_loss_clip": 0.01540702, + "auxiliary_loss_mlp": 0.0130923, + "balance_loss_clip": 1.19014251, + "balance_loss_mlp": 1.03914952, + "epoch": 0.20447918232376372, + "flos": 27710097148800.0, + "grad_norm": 1.8035855990447536, + "language_loss": 0.79448366, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.82298297, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.919539213180542 + }, + { + "auxiliary_loss_clip": 0.01536658, + "auxiliary_loss_mlp": 0.0132018, + "balance_loss_clip": 1.18601847, + "balance_loss_mlp": 1.04647565, + "epoch": 0.20453930557643168, + "flos": 30527163429120.0, + "grad_norm": 1.6699790936606953, + "language_loss": 0.69926178, + "learning_rate": 3.688851985676991e-06, + "loss": 0.72783017, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 4.630762815475464 + }, + { + "auxiliary_loss_clip": 0.01550531, + "auxiliary_loss_mlp": 0.0133152, + "balance_loss_clip": 1.20028198, + "balance_loss_mlp": 1.06220245, + "epoch": 0.20459942882909965, + "flos": 18989681313600.0, + "grad_norm": 2.1356322102662615, + "language_loss": 0.80830604, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83712649, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.84511661529541 + }, + { + "auxiliary_loss_clip": 0.01547229, + "auxiliary_loss_mlp": 0.01330839, + "balance_loss_clip": 1.19704723, + "balance_loss_mlp": 1.05866015, + "epoch": 0.20465955208176762, + "flos": 20341113062400.0, + "grad_norm": 2.0864225510047634, + "language_loss": 0.83426094, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.86304164, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.8109073638916016 + }, + { + "auxiliary_loss_clip": 0.01545488, + "auxiliary_loss_mlp": 0.01323817, + "balance_loss_clip": 1.19559896, + "balance_loss_mlp": 1.0546906, + "epoch": 0.20471967533443558, + "flos": 21253408202880.0, + "grad_norm": 2.2253831006091005, + "language_loss": 0.8621856, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.89087868, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.8441922664642334 + }, + { + "auxiliary_loss_clip": 0.01552304, + "auxiliary_loss_mlp": 0.01328232, + "balance_loss_clip": 1.20195603, + "balance_loss_mlp": 1.06368303, + "epoch": 0.20477979858710357, + "flos": 14503304096640.0, + "grad_norm": 2.3466081433844144, + "language_loss": 0.84717053, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.87597591, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 4.354383230209351 + }, + { + "auxiliary_loss_clip": 0.01540345, + "auxiliary_loss_mlp": 0.01314014, + "balance_loss_clip": 1.19109559, + "balance_loss_mlp": 1.04908288, + "epoch": 0.20483992183977154, + "flos": 11401887486240.0, + "grad_norm": 2.385941170427161, + "language_loss": 0.68110365, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.7096473, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.784459352493286 + }, + { + "auxiliary_loss_clip": 0.01547866, + "auxiliary_loss_mlp": 0.01334519, + "balance_loss_clip": 1.19766414, + "balance_loss_mlp": 1.07035184, + "epoch": 0.2049000450924395, + "flos": 19062162756000.0, + "grad_norm": 2.2960679680798077, + "language_loss": 0.84470308, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.87352687, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.889819383621216 + }, + { + "auxiliary_loss_clip": 0.01543132, + "auxiliary_loss_mlp": 0.0133857, + "balance_loss_clip": 1.19465613, + "balance_loss_mlp": 1.06868005, + "epoch": 0.20496016834510747, + "flos": 14576316533280.0, + "grad_norm": 2.8338055452895703, + "language_loss": 0.64603853, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.67485553, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.8324460983276367 + }, + { + "auxiliary_loss_clip": 0.0154365, + "auxiliary_loss_mlp": 0.01321122, + "balance_loss_clip": 1.19460332, + "balance_loss_mlp": 1.04779875, + "epoch": 0.20502029159777543, + "flos": 22128798879360.0, + "grad_norm": 1.5396021839359337, + "language_loss": 0.806615, + "learning_rate": 3.687180946553745e-06, + "loss": 0.83526278, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 4.407557249069214 + }, + { + "auxiliary_loss_clip": 0.01563419, + "auxiliary_loss_mlp": 0.01330688, + "balance_loss_clip": 1.21456802, + "balance_loss_mlp": 1.06919074, + "epoch": 0.2050804148504434, + "flos": 25369906360320.0, + "grad_norm": 3.2391962281679296, + "language_loss": 0.76870859, + "learning_rate": 3.686971778678803e-06, + "loss": 0.79764968, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.8426971435546875 + }, + { + "auxiliary_loss_clip": 0.01550682, + "auxiliary_loss_mlp": 0.01351646, + "balance_loss_clip": 1.20078754, + "balance_loss_mlp": 1.09014821, + "epoch": 0.2051405381031114, + "flos": 23622273044640.0, + "grad_norm": 1.956676879459682, + "language_loss": 0.73604333, + "learning_rate": 3.686762546833722e-06, + "loss": 0.76506668, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 4.535698413848877 + }, + { + "auxiliary_loss_clip": 0.01549676, + "auxiliary_loss_mlp": 0.01351888, + "balance_loss_clip": 1.19884455, + "balance_loss_mlp": 1.09019971, + "epoch": 0.20520066135577936, + "flos": 19567143378720.0, + "grad_norm": 2.598723009328062, + "language_loss": 0.78057206, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.80958772, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 2.821035623550415 + }, + { + "auxiliary_loss_clip": 0.01561527, + "auxiliary_loss_mlp": 0.01344246, + "balance_loss_clip": 1.21160483, + "balance_loss_mlp": 1.07950556, + "epoch": 0.20526078460844732, + "flos": 17678302066080.0, + "grad_norm": 2.17507008932077, + "language_loss": 0.85240597, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.88146371, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.788360357284546 + }, + { + "auxiliary_loss_clip": 0.01538874, + "auxiliary_loss_mlp": 0.013271, + "balance_loss_clip": 1.19126642, + "balance_loss_mlp": 1.06178737, + "epoch": 0.2053209078611153, + "flos": 21502105698240.0, + "grad_norm": 3.1696577028193, + "language_loss": 0.81420767, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.84286737, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.8038809299468994 + }, + { + "auxiliary_loss_clip": 0.0155713, + "auxiliary_loss_mlp": 0.01343821, + "balance_loss_clip": 1.21101499, + "balance_loss_mlp": 1.07545698, + "epoch": 0.20538103111378325, + "flos": 25665748922880.0, + "grad_norm": 1.7575124569537286, + "language_loss": 0.72704095, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75605047, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 2.892591953277588 + }, + { + "auxiliary_loss_clip": 0.0155281, + "auxiliary_loss_mlp": 0.01326327, + "balance_loss_clip": 1.20611787, + "balance_loss_mlp": 1.05414844, + "epoch": 0.20544115436645122, + "flos": 23151200561280.0, + "grad_norm": 2.395109826904813, + "language_loss": 0.79087532, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.81966674, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 2.8710169792175293 + }, + { + "auxiliary_loss_clip": 0.01560483, + "auxiliary_loss_mlp": 0.0132163, + "balance_loss_clip": 1.21405399, + "balance_loss_mlp": 1.04792523, + "epoch": 0.20550127761911918, + "flos": 19392444452160.0, + "grad_norm": 2.467228083453456, + "language_loss": 0.87468493, + "learning_rate": 3.685505812834798e-06, + "loss": 0.9035061, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 2.8884427547454834 + }, + { + "auxiliary_loss_clip": 0.0154138, + "auxiliary_loss_mlp": 0.01314452, + "balance_loss_clip": 1.19579685, + "balance_loss_mlp": 1.04341817, + "epoch": 0.20556140087178718, + "flos": 22895334643680.0, + "grad_norm": 2.500248182796156, + "language_loss": 0.62121737, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64977568, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.8815832138061523 + }, + { + "auxiliary_loss_clip": 0.01550873, + "auxiliary_loss_mlp": 0.01342773, + "balance_loss_clip": 1.20492005, + "balance_loss_mlp": 1.07192993, + "epoch": 0.20562152412445514, + "flos": 19791566487360.0, + "grad_norm": 3.6644014162784027, + "language_loss": 0.86417001, + "learning_rate": 3.685086390100674e-06, + "loss": 0.8931064, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 2.849923610687256 + }, + { + "auxiliary_loss_clip": 0.01554257, + "auxiliary_loss_mlp": 0.01313343, + "balance_loss_clip": 1.20754564, + "balance_loss_mlp": 1.043262, + "epoch": 0.2056816473771231, + "flos": 31504581948960.0, + "grad_norm": 2.707082702803135, + "language_loss": 0.71696258, + "learning_rate": 3.684876582881668e-06, + "loss": 0.74563861, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 3.065523624420166 + }, + { + "auxiliary_loss_clip": 0.01544335, + "auxiliary_loss_mlp": 0.01309911, + "balance_loss_clip": 1.19846559, + "balance_loss_mlp": 1.03963923, + "epoch": 0.20574177062979107, + "flos": 23260624395840.0, + "grad_norm": 2.311764347244202, + "language_loss": 0.7084446, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.73698705, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 2.827146530151367 + }, + { + "auxiliary_loss_clip": 0.01740582, + "auxiliary_loss_mlp": 0.01269928, + "balance_loss_clip": 1.40126491, + "balance_loss_mlp": 1.05172729, + "epoch": 0.20580189388245904, + "flos": 70318751757120.0, + "grad_norm": 0.7668672104784955, + "language_loss": 0.5543676, + "learning_rate": 3.684456776779548e-06, + "loss": 0.58447272, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 3.366382598876953 + }, + { + "auxiliary_loss_clip": 0.01542078, + "auxiliary_loss_mlp": 0.01304734, + "balance_loss_clip": 1.19506967, + "balance_loss_mlp": 1.03083873, + "epoch": 0.205862017135127, + "flos": 30740511515040.0, + "grad_norm": 1.8182236860023004, + "language_loss": 0.72095478, + "learning_rate": 3.684246777912353e-06, + "loss": 0.74942291, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 2.9333112239837646 + }, + { + "auxiliary_loss_clip": 0.0154692, + "auxiliary_loss_mlp": 0.01316766, + "balance_loss_clip": 1.20043397, + "balance_loss_mlp": 1.04840159, + "epoch": 0.20592214038779497, + "flos": 21326686136640.0, + "grad_norm": 1.6792198492314956, + "language_loss": 0.75232661, + "learning_rate": 3.684036715178351e-06, + "loss": 0.78096342, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.799931049346924 + }, + { + "auxiliary_loss_clip": 0.01540468, + "auxiliary_loss_mlp": 0.01308061, + "balance_loss_clip": 1.19285691, + "balance_loss_mlp": 1.03759921, + "epoch": 0.20598226364046296, + "flos": 22893779589120.0, + "grad_norm": 1.9613167922220938, + "language_loss": 0.88399428, + "learning_rate": 3.683826588585508e-06, + "loss": 0.91247958, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.8731155395507812 + }, + { + "auxiliary_loss_clip": 0.01548435, + "auxiliary_loss_mlp": 0.01320438, + "balance_loss_clip": 1.20086884, + "balance_loss_mlp": 1.05665171, + "epoch": 0.20604238689313092, + "flos": 23880831861600.0, + "grad_norm": 2.2613555470014677, + "language_loss": 0.7741428, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.80283153, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.83892560005188 + }, + { + "auxiliary_loss_clip": 0.01542128, + "auxiliary_loss_mlp": 0.01331008, + "balance_loss_clip": 1.19317532, + "balance_loss_mlp": 1.06550479, + "epoch": 0.2061025101457989, + "flos": 22493709349920.0, + "grad_norm": 1.9547241937024527, + "language_loss": 0.74405724, + "learning_rate": 3.683406143855174e-06, + "loss": 0.77278858, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 2.8902480602264404 + }, + { + "auxiliary_loss_clip": 0.0154079, + "auxiliary_loss_mlp": 0.01315902, + "balance_loss_clip": 1.1935885, + "balance_loss_mlp": 1.04410446, + "epoch": 0.20616263339846685, + "flos": 22780866363840.0, + "grad_norm": 2.0186373581494976, + "language_loss": 0.73327237, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.76183927, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.811326503753662 + }, + { + "auxiliary_loss_clip": 0.01541439, + "auxiliary_loss_mlp": 0.01323737, + "balance_loss_clip": 1.19511342, + "balance_loss_mlp": 1.05651784, + "epoch": 0.20622275665113482, + "flos": 20884060137600.0, + "grad_norm": 2.7395570723333273, + "language_loss": 0.85082006, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87947184, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 2.8733949661254883 + }, + { + "auxiliary_loss_clip": 0.01538958, + "auxiliary_loss_mlp": 0.01318591, + "balance_loss_clip": 1.19203806, + "balance_loss_mlp": 1.04736602, + "epoch": 0.20628287990380278, + "flos": 19356450264000.0, + "grad_norm": 2.6006953944243922, + "language_loss": 0.69447088, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.7230463, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 2.90086030960083 + }, + { + "auxiliary_loss_clip": 0.01715532, + "auxiliary_loss_mlp": 0.01330337, + "balance_loss_clip": 1.37627125, + "balance_loss_mlp": 1.11213684, + "epoch": 0.20634300315647078, + "flos": 71524310345280.0, + "grad_norm": 0.826438552332051, + "language_loss": 0.60215616, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.63261485, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.5274994373321533 + }, + { + "auxiliary_loss_clip": 0.01537081, + "auxiliary_loss_mlp": 0.01310853, + "balance_loss_clip": 1.19054055, + "balance_loss_mlp": 1.03314281, + "epoch": 0.20640312640913874, + "flos": 21725997812640.0, + "grad_norm": 1.6237901317507775, + "language_loss": 0.72676492, + "learning_rate": 3.682353915057679e-06, + "loss": 0.75524431, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 2.855487823486328 + }, + { + "auxiliary_loss_clip": 0.01537602, + "auxiliary_loss_mlp": 0.01312619, + "balance_loss_clip": 1.19084096, + "balance_loss_mlp": 1.0404408, + "epoch": 0.2064632496618067, + "flos": 20556509268960.0, + "grad_norm": 2.4731542853133575, + "language_loss": 0.87171245, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.90021467, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 2.836012840270996 + }, + { + "auxiliary_loss_clip": 0.01534377, + "auxiliary_loss_mlp": 0.01330294, + "balance_loss_clip": 1.18854141, + "balance_loss_mlp": 1.05754352, + "epoch": 0.20652337291447467, + "flos": 29825751044160.0, + "grad_norm": 2.022345465103773, + "language_loss": 0.69903767, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.72768438, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 2.900843620300293 + }, + { + "auxiliary_loss_clip": 0.01533318, + "auxiliary_loss_mlp": 0.01313464, + "balance_loss_clip": 1.18659925, + "balance_loss_mlp": 1.04223943, + "epoch": 0.20658349616714264, + "flos": 26216054061120.0, + "grad_norm": 1.8349976627679487, + "language_loss": 0.89356798, + "learning_rate": 3.681721812174988e-06, + "loss": 0.92203581, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 2.882908582687378 + }, + { + "auxiliary_loss_clip": 0.01533102, + "auxiliary_loss_mlp": 0.01318071, + "balance_loss_clip": 1.18607378, + "balance_loss_mlp": 1.04589272, + "epoch": 0.2066436194198106, + "flos": 25996751254080.0, + "grad_norm": 1.91767493790282, + "language_loss": 0.76442158, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.79293334, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.866673231124878 + }, + { + "auxiliary_loss_clip": 0.01528543, + "auxiliary_loss_mlp": 0.01304851, + "balance_loss_clip": 1.1815753, + "balance_loss_mlp": 1.0328629, + "epoch": 0.20670374267247857, + "flos": 21363211319040.0, + "grad_norm": 2.4244352077622495, + "language_loss": 0.77450848, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.80284238, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 2.839869499206543 + }, + { + "auxiliary_loss_clip": 0.01690431, + "auxiliary_loss_mlp": 0.01263481, + "balance_loss_clip": 1.35131872, + "balance_loss_mlp": 1.03765106, + "epoch": 0.20676386592514656, + "flos": 66389923956960.0, + "grad_norm": 0.8625680037064507, + "language_loss": 0.67083502, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.70037413, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 3.3207149505615234 + }, + { + "auxiliary_loss_clip": 0.01522838, + "auxiliary_loss_mlp": 0.01314818, + "balance_loss_clip": 1.17754447, + "balance_loss_mlp": 1.04130447, + "epoch": 0.20682398917781453, + "flos": 17276487131520.0, + "grad_norm": 2.168789027797565, + "language_loss": 0.84899032, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.8773669, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 4.545644521713257 + }, + { + "auxiliary_loss_clip": 0.01520373, + "auxiliary_loss_mlp": 0.01307497, + "balance_loss_clip": 1.17448115, + "balance_loss_mlp": 1.036654, + "epoch": 0.2068841124304825, + "flos": 18079244652960.0, + "grad_norm": 2.311374079169718, + "language_loss": 0.85501599, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.8832947, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 2.93672513961792 + }, + { + "auxiliary_loss_clip": 0.01520546, + "auxiliary_loss_mlp": 0.01313748, + "balance_loss_clip": 1.17435861, + "balance_loss_mlp": 1.04443049, + "epoch": 0.20694423568315046, + "flos": 27349624272960.0, + "grad_norm": 3.7155794359306986, + "language_loss": 0.85977316, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88811606, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 2.894260883331299 + }, + { + "auxiliary_loss_clip": 0.01533546, + "auxiliary_loss_mlp": 0.01336264, + "balance_loss_clip": 1.18644047, + "balance_loss_mlp": 1.07324076, + "epoch": 0.20700435893581842, + "flos": 20231916796800.0, + "grad_norm": 2.0897530531018433, + "language_loss": 0.72953665, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75823474, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 2.8312151432037354 + }, + { + "auxiliary_loss_clip": 0.01520471, + "auxiliary_loss_mlp": 0.01323681, + "balance_loss_clip": 1.17410517, + "balance_loss_mlp": 1.0591321, + "epoch": 0.2070644821884864, + "flos": 20633200737120.0, + "grad_norm": 1.9388879539140862, + "language_loss": 0.85524064, + "learning_rate": 3.680033399147797e-06, + "loss": 0.88368213, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 4.29696798324585 + }, + { + "auxiliary_loss_clip": 0.01687752, + "auxiliary_loss_mlp": 0.01288948, + "balance_loss_clip": 1.34741688, + "balance_loss_mlp": 1.06769562, + "epoch": 0.20712460544115438, + "flos": 65947449670560.0, + "grad_norm": 0.6957813814561333, + "language_loss": 0.56998599, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59975296, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.356703758239746 + }, + { + "auxiliary_loss_clip": 0.01529607, + "auxiliary_loss_mlp": 0.01329894, + "balance_loss_clip": 1.18138337, + "balance_loss_mlp": 1.06362808, + "epoch": 0.20718472869382235, + "flos": 19427680077120.0, + "grad_norm": 1.5327954912691095, + "language_loss": 0.78572369, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.81431866, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.8544962406158447 + }, + { + "auxiliary_loss_clip": 0.01524134, + "auxiliary_loss_mlp": 0.01330971, + "balance_loss_clip": 1.17540503, + "balance_loss_mlp": 1.05650377, + "epoch": 0.2072448519464903, + "flos": 24501380680800.0, + "grad_norm": 2.384804560628635, + "language_loss": 0.62958992, + "learning_rate": 3.679399192876334e-06, + "loss": 0.65814096, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.8601512908935547 + }, + { + "auxiliary_loss_clip": 0.01518772, + "auxiliary_loss_mlp": 0.01317034, + "balance_loss_clip": 1.16909456, + "balance_loss_mlp": 1.0469532, + "epoch": 0.20730497519915828, + "flos": 23077922627520.0, + "grad_norm": 1.716928283466579, + "language_loss": 0.866768, + "learning_rate": 3.679187663409184e-06, + "loss": 0.89512599, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.850432872772217 + }, + { + "auxiliary_loss_clip": 0.01526752, + "auxiliary_loss_mlp": 0.01326832, + "balance_loss_clip": 1.17646527, + "balance_loss_mlp": 1.05884933, + "epoch": 0.20736509845182624, + "flos": 21071161572480.0, + "grad_norm": 2.235799607443124, + "language_loss": 0.7563259, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.78486168, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 4.269132852554321 + }, + { + "auxiliary_loss_clip": 0.0152689, + "auxiliary_loss_mlp": 0.01319928, + "balance_loss_clip": 1.17727804, + "balance_loss_mlp": 1.04259992, + "epoch": 0.2074252217044942, + "flos": 17634873958560.0, + "grad_norm": 2.913955154016695, + "language_loss": 0.76378638, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.79225457, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 4.312192440032959 + }, + { + "auxiliary_loss_clip": 0.01521299, + "auxiliary_loss_mlp": 0.01315157, + "balance_loss_clip": 1.17067516, + "balance_loss_mlp": 1.03935432, + "epoch": 0.20748534495716217, + "flos": 23549070967200.0, + "grad_norm": 3.275943514925523, + "language_loss": 0.82673991, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.85510445, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.954756021499634 + }, + { + "auxiliary_loss_clip": 0.01688725, + "auxiliary_loss_mlp": 0.01252625, + "balance_loss_clip": 1.34614384, + "balance_loss_mlp": 1.02526855, + "epoch": 0.20754546820983016, + "flos": 52258205687040.0, + "grad_norm": 0.8210203245673888, + "language_loss": 0.56473196, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.59414542, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 3.288100242614746 + }, + { + "auxiliary_loss_clip": 0.01531286, + "auxiliary_loss_mlp": 0.01331658, + "balance_loss_clip": 1.18010759, + "balance_loss_mlp": 1.05852592, + "epoch": 0.20760559146249813, + "flos": 20414466852480.0, + "grad_norm": 28.586933528898772, + "language_loss": 0.88097954, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90960902, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.8690004348754883 + }, + { + "auxiliary_loss_clip": 0.01525342, + "auxiliary_loss_mlp": 0.01319505, + "balance_loss_clip": 1.17615819, + "balance_loss_mlp": 1.04255748, + "epoch": 0.2076657147151661, + "flos": 23188294666080.0, + "grad_norm": 1.8041142638206344, + "language_loss": 0.80348974, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.83193821, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 2.851771831512451 + }, + { + "auxiliary_loss_clip": 0.01527067, + "auxiliary_loss_mlp": 0.01312694, + "balance_loss_clip": 1.17752266, + "balance_loss_mlp": 1.0391804, + "epoch": 0.20772583796783406, + "flos": 18295020141120.0, + "grad_norm": 3.1983753216157047, + "language_loss": 0.77480543, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.80320311, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 2.893225908279419 + }, + { + "auxiliary_loss_clip": 0.01534909, + "auxiliary_loss_mlp": 0.01318954, + "balance_loss_clip": 1.18346548, + "balance_loss_mlp": 1.05078125, + "epoch": 0.20778596122050202, + "flos": 17604720707040.0, + "grad_norm": 1.9211261207391068, + "language_loss": 0.80532086, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.83385944, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.855140209197998 + }, + { + "auxiliary_loss_clip": 0.01524086, + "auxiliary_loss_mlp": 0.01329994, + "balance_loss_clip": 1.17460358, + "balance_loss_mlp": 1.05495417, + "epoch": 0.20784608447317, + "flos": 23807857353120.0, + "grad_norm": 1.5351433058705817, + "language_loss": 0.78368247, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.81222332, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 2.86747407913208 + }, + { + "auxiliary_loss_clip": 0.01520878, + "auxiliary_loss_mlp": 0.01324784, + "balance_loss_clip": 1.17093086, + "balance_loss_mlp": 1.04802787, + "epoch": 0.20790620772583795, + "flos": 17641207961280.0, + "grad_norm": 2.072289407781683, + "language_loss": 0.83452886, + "learning_rate": 3.677068867939333e-06, + "loss": 0.86298549, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 2.8997254371643066 + }, + { + "auxiliary_loss_clip": 0.01534989, + "auxiliary_loss_mlp": 0.0131824, + "balance_loss_clip": 1.18471014, + "balance_loss_mlp": 1.04567969, + "epoch": 0.20796633097850595, + "flos": 27675999368640.0, + "grad_norm": 2.153618702503356, + "language_loss": 0.76319873, + "learning_rate": 3.676856638489272e-06, + "loss": 0.79173112, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 2.955366611480713 + }, + { + "auxiliary_loss_clip": 0.01529523, + "auxiliary_loss_mlp": 0.01330284, + "balance_loss_clip": 1.17944348, + "balance_loss_mlp": 1.06077576, + "epoch": 0.2080264542311739, + "flos": 19247671208160.0, + "grad_norm": 4.04890838804469, + "language_loss": 0.77785134, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.80644941, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 2.931748390197754 + }, + { + "auxiliary_loss_clip": 0.01519408, + "auxiliary_loss_mlp": 0.0133113, + "balance_loss_clip": 1.16815257, + "balance_loss_mlp": 1.06276631, + "epoch": 0.20808657748384188, + "flos": 27528343584480.0, + "grad_norm": 1.8545199984407403, + "language_loss": 0.75702369, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.78552902, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.8404922485351562 + }, + { + "auxiliary_loss_clip": 0.01524829, + "auxiliary_loss_mlp": 0.01325167, + "balance_loss_clip": 1.17410815, + "balance_loss_mlp": 1.05432367, + "epoch": 0.20814670073650984, + "flos": 26909387748000.0, + "grad_norm": 2.6855623680560305, + "language_loss": 0.8890335, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.91753352, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.9767940044403076 + }, + { + "auxiliary_loss_clip": 0.01698375, + "auxiliary_loss_mlp": 0.01280045, + "balance_loss_clip": 1.35546184, + "balance_loss_mlp": 1.0526886, + "epoch": 0.2082068239891778, + "flos": 70182474420960.0, + "grad_norm": 0.7642549008773208, + "language_loss": 0.5899021, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.61968631, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 3.520904064178467 + }, + { + "auxiliary_loss_clip": 0.01524588, + "auxiliary_loss_mlp": 0.01329305, + "balance_loss_clip": 1.17264509, + "balance_loss_mlp": 1.0556004, + "epoch": 0.20826694724184577, + "flos": 24610387305600.0, + "grad_norm": 4.123217951042222, + "language_loss": 0.66853571, + "learning_rate": 3.675794537601429e-06, + "loss": 0.69707465, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.88175892829895 + }, + { + "auxiliary_loss_clip": 0.01534233, + "auxiliary_loss_mlp": 0.01318905, + "balance_loss_clip": 1.18370509, + "balance_loss_mlp": 1.04253006, + "epoch": 0.20832707049451377, + "flos": 12894299663040.0, + "grad_norm": 1.982885692572909, + "language_loss": 0.84091413, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.8694455, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.8382976055145264 + }, + { + "auxiliary_loss_clip": 0.01541074, + "auxiliary_loss_mlp": 0.01319726, + "balance_loss_clip": 1.1913321, + "balance_loss_mlp": 1.04716611, + "epoch": 0.20838719374718173, + "flos": 22200938968320.0, + "grad_norm": 2.2900902447483413, + "language_loss": 0.81538445, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84399247, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.8674635887145996 + }, + { + "auxiliary_loss_clip": 0.01529961, + "auxiliary_loss_mlp": 0.01318652, + "balance_loss_clip": 1.17895937, + "balance_loss_mlp": 1.05028808, + "epoch": 0.2084473169998497, + "flos": 15160453954560.0, + "grad_norm": 1.9400138479312323, + "language_loss": 0.82209325, + "learning_rate": 3.675156514448716e-06, + "loss": 0.85057938, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.855102300643921 + }, + { + "auxiliary_loss_clip": 0.01538926, + "auxiliary_loss_mlp": 0.01308421, + "balance_loss_clip": 1.18778133, + "balance_loss_mlp": 1.03910375, + "epoch": 0.20850744025251766, + "flos": 17458695833760.0, + "grad_norm": 2.2912045807940693, + "language_loss": 0.81856996, + "learning_rate": 3.674943713009518e-06, + "loss": 0.8470434, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 2.8281333446502686 + }, + { + "auxiliary_loss_clip": 0.01533434, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_clip": 1.18352926, + "balance_loss_mlp": 1.04803133, + "epoch": 0.20856756350518563, + "flos": 25701136260480.0, + "grad_norm": 1.9059748189870012, + "language_loss": 0.90364993, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.93223405, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.868697166442871 + }, + { + "auxiliary_loss_clip": 0.01534786, + "auxiliary_loss_mlp": 0.01312595, + "balance_loss_clip": 1.18523288, + "balance_loss_mlp": 1.03545725, + "epoch": 0.2086276867578536, + "flos": 37892127130560.0, + "grad_norm": 1.72032163910533, + "language_loss": 0.76701266, + "learning_rate": 3.674517919597092e-06, + "loss": 0.79548645, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 2.9396591186523438 + }, + { + "auxiliary_loss_clip": 0.01531071, + "auxiliary_loss_mlp": 0.01314298, + "balance_loss_clip": 1.17904019, + "balance_loss_mlp": 1.04154754, + "epoch": 0.20868781001052156, + "flos": 25559435197440.0, + "grad_norm": 6.628062808377524, + "language_loss": 0.76638705, + "learning_rate": 3.674304927640011e-06, + "loss": 0.79484075, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.874417304992676 + }, + { + "auxiliary_loss_clip": 0.01520819, + "auxiliary_loss_mlp": 0.01321341, + "balance_loss_clip": 1.17144561, + "balance_loss_mlp": 1.03638315, + "epoch": 0.20874793326318955, + "flos": 27531795047040.0, + "grad_norm": 1.912967652442087, + "language_loss": 0.76145011, + "learning_rate": 3.67409187219312e-06, + "loss": 0.78987169, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 2.913931369781494 + }, + { + "auxiliary_loss_clip": 0.01532705, + "auxiliary_loss_mlp": 0.01326623, + "balance_loss_clip": 1.18326974, + "balance_loss_mlp": 1.05139279, + "epoch": 0.20880805651585752, + "flos": 18550620561600.0, + "grad_norm": 2.3703527838270992, + "language_loss": 0.8482995, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.87689281, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 2.8523929119110107 + }, + { + "auxiliary_loss_clip": 0.01702267, + "auxiliary_loss_mlp": 0.01250175, + "balance_loss_clip": 1.36140275, + "balance_loss_mlp": 1.02205658, + "epoch": 0.20886817976852548, + "flos": 65953328535360.0, + "grad_norm": 0.8966038886014432, + "language_loss": 0.63637197, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.66589642, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 3.373375415802002 + }, + { + "auxiliary_loss_clip": 0.0153955, + "auxiliary_loss_mlp": 0.01336723, + "balance_loss_clip": 1.19014859, + "balance_loss_mlp": 1.06053925, + "epoch": 0.20892830302119345, + "flos": 36542174580000.0, + "grad_norm": 4.7540502370708655, + "language_loss": 0.70004725, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72880995, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 3.100341558456421 + }, + { + "auxiliary_loss_clip": 0.01530792, + "auxiliary_loss_mlp": 0.01317746, + "balance_loss_clip": 1.18142748, + "balance_loss_mlp": 1.04194331, + "epoch": 0.2089884262738614, + "flos": 20958703485120.0, + "grad_norm": 1.7087450996025306, + "language_loss": 0.70231563, + "learning_rate": 3.673239015669065e-06, + "loss": 0.73080105, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.844290256500244 + }, + { + "auxiliary_loss_clip": 0.01532527, + "auxiliary_loss_mlp": 0.01316755, + "balance_loss_clip": 1.18235421, + "balance_loss_mlp": 1.0432415, + "epoch": 0.20904854952652938, + "flos": 22786252162560.0, + "grad_norm": 1.8337433409233528, + "language_loss": 0.89662153, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.92511433, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.9012413024902344 + }, + { + "auxiliary_loss_clip": 0.01529581, + "auxiliary_loss_mlp": 0.0131872, + "balance_loss_clip": 1.18084908, + "balance_loss_mlp": 1.04272687, + "epoch": 0.20910867277919734, + "flos": 27305475530400.0, + "grad_norm": 2.3295671429038984, + "language_loss": 0.68088168, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70936465, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 4.607254266738892 + }, + { + "auxiliary_loss_clip": 0.01529711, + "auxiliary_loss_mlp": 0.01326086, + "balance_loss_clip": 1.17955673, + "balance_loss_mlp": 1.05295336, + "epoch": 0.20916879603186533, + "flos": 14320981609920.0, + "grad_norm": 2.2780418644235825, + "language_loss": 0.8478986, + "learning_rate": 3.672598707029127e-06, + "loss": 0.8764565, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 2.8591668605804443 + }, + { + "auxiliary_loss_clip": 0.01527944, + "auxiliary_loss_mlp": 0.01341983, + "balance_loss_clip": 1.17814541, + "balance_loss_mlp": 1.06942332, + "epoch": 0.2092289192845333, + "flos": 22275203034240.0, + "grad_norm": 2.5355132795611004, + "language_loss": 0.74451911, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.77321833, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.853964328765869 + }, + { + "auxiliary_loss_clip": 0.01532839, + "auxiliary_loss_mlp": 0.01318804, + "balance_loss_clip": 1.18372869, + "balance_loss_mlp": 1.05177546, + "epoch": 0.20928904253720126, + "flos": 14832523804320.0, + "grad_norm": 2.0075976252008645, + "language_loss": 0.75669438, + "learning_rate": 3.67217151746346e-06, + "loss": 0.78521079, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 2.873940944671631 + }, + { + "auxiliary_loss_clip": 0.01531392, + "auxiliary_loss_mlp": 0.01320013, + "balance_loss_clip": 1.18229783, + "balance_loss_mlp": 1.04993284, + "epoch": 0.20934916578986923, + "flos": 23261458815360.0, + "grad_norm": 1.8082707004315306, + "language_loss": 0.85531741, + "learning_rate": 3.671957827563209e-06, + "loss": 0.8838315, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.889214515686035 + }, + { + "auxiliary_loss_clip": 0.01528769, + "auxiliary_loss_mlp": 0.01335663, + "balance_loss_clip": 1.17867875, + "balance_loss_mlp": 1.06634569, + "epoch": 0.2094092890425372, + "flos": 32017110275520.0, + "grad_norm": 2.2292859397755076, + "language_loss": 0.70830965, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.73695397, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 4.415517330169678 + }, + { + "auxiliary_loss_clip": 0.01530359, + "auxiliary_loss_mlp": 0.013304, + "balance_loss_clip": 1.18102133, + "balance_loss_mlp": 1.0627985, + "epoch": 0.20946941229520516, + "flos": 20012689846080.0, + "grad_norm": 1.8925308192361232, + "language_loss": 0.75076276, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77937031, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 2.9679811000823975 + }, + { + "auxiliary_loss_clip": 0.01539222, + "auxiliary_loss_mlp": 0.01318979, + "balance_loss_clip": 1.19190764, + "balance_loss_mlp": 1.04584694, + "epoch": 0.20952953554787315, + "flos": 30742863060960.0, + "grad_norm": 1.9382874650804574, + "language_loss": 0.70433456, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.73291659, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.977222442626953 + }, + { + "auxiliary_loss_clip": 0.01536964, + "auxiliary_loss_mlp": 0.01326792, + "balance_loss_clip": 1.19098163, + "balance_loss_mlp": 1.05499458, + "epoch": 0.20958965880054112, + "flos": 27051468092640.0, + "grad_norm": 2.1336785960244087, + "language_loss": 0.83350015, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.86213773, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 4.428226470947266 + }, + { + "auxiliary_loss_clip": 0.01538133, + "auxiliary_loss_mlp": 0.01323193, + "balance_loss_clip": 1.19121504, + "balance_loss_mlp": 1.05559158, + "epoch": 0.20964978205320908, + "flos": 34206990308640.0, + "grad_norm": 1.8046850294536443, + "language_loss": 0.87446439, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.9030776, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 2.9888243675231934 + }, + { + "auxiliary_loss_clip": 0.01530672, + "auxiliary_loss_mlp": 0.01307545, + "balance_loss_clip": 1.18347812, + "balance_loss_mlp": 1.03689241, + "epoch": 0.20970990530587705, + "flos": 23479661705760.0, + "grad_norm": 2.508646674999836, + "language_loss": 0.7293303, + "learning_rate": 3.670674357028504e-06, + "loss": 0.75771248, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 4.449512720108032 + }, + { + "auxiliary_loss_clip": 0.01533597, + "auxiliary_loss_mlp": 0.01319564, + "balance_loss_clip": 1.18839645, + "balance_loss_mlp": 1.04852939, + "epoch": 0.209770028558545, + "flos": 18553427245440.0, + "grad_norm": 2.820336651242878, + "language_loss": 0.80525827, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.83378994, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 2.88171648979187 + }, + { + "auxiliary_loss_clip": 0.01534008, + "auxiliary_loss_mlp": 0.01322572, + "balance_loss_clip": 1.18719208, + "balance_loss_mlp": 1.05325401, + "epoch": 0.20983015181121298, + "flos": 21619191021120.0, + "grad_norm": 1.9228683203040708, + "language_loss": 0.73576069, + "learning_rate": 3.670246026613266e-06, + "loss": 0.76432657, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 2.852966785430908 + }, + { + "auxiliary_loss_clip": 0.01539238, + "auxiliary_loss_mlp": 0.01316874, + "balance_loss_clip": 1.19295049, + "balance_loss_mlp": 1.0528965, + "epoch": 0.20989027506388094, + "flos": 16616416805280.0, + "grad_norm": 2.0072571882839387, + "language_loss": 0.7067101, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.73527122, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.781050443649292 + }, + { + "auxiliary_loss_clip": 0.01526636, + "auxiliary_loss_mlp": 0.01330818, + "balance_loss_clip": 1.18027925, + "balance_loss_mlp": 1.06779444, + "epoch": 0.20995039831654894, + "flos": 23218789271040.0, + "grad_norm": 2.6387859708583474, + "language_loss": 0.80109334, + "learning_rate": 3.669817442854444e-06, + "loss": 0.82966793, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 2.8923017978668213 + }, + { + "auxiliary_loss_clip": 0.01531861, + "auxiliary_loss_mlp": 0.01323427, + "balance_loss_clip": 1.18450236, + "balance_loss_mlp": 1.06231117, + "epoch": 0.2100105215692169, + "flos": 18149298693120.0, + "grad_norm": 3.1295379387296838, + "language_loss": 0.86887425, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89742714, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 2.875497341156006 + }, + { + "auxiliary_loss_clip": 0.01535509, + "auxiliary_loss_mlp": 0.01318768, + "balance_loss_clip": 1.18920016, + "balance_loss_mlp": 1.0566982, + "epoch": 0.21007064482188487, + "flos": 15963476973120.0, + "grad_norm": 1.9351906604304219, + "language_loss": 0.69355595, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.72209877, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 2.845963954925537 + }, + { + "auxiliary_loss_clip": 0.01533717, + "auxiliary_loss_mlp": 0.01363034, + "balance_loss_clip": 1.18918836, + "balance_loss_mlp": 1.09352541, + "epoch": 0.21013076807455283, + "flos": 32237095789440.0, + "grad_norm": 1.9459853192605074, + "language_loss": 0.79239136, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.82135892, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 2.9730803966522217 + }, + { + "auxiliary_loss_clip": 0.01528173, + "auxiliary_loss_mlp": 0.01311166, + "balance_loss_clip": 1.18384898, + "balance_loss_mlp": 1.03975034, + "epoch": 0.2101908913272208, + "flos": 23698926584640.0, + "grad_norm": 1.6311583655390454, + "language_loss": 0.77631968, + "learning_rate": 3.668959515566116e-06, + "loss": 0.80471307, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 2.8639914989471436 + }, + { + "auxiliary_loss_clip": 0.0152949, + "auxiliary_loss_mlp": 0.01328091, + "balance_loss_clip": 1.18450141, + "balance_loss_mlp": 1.05858243, + "epoch": 0.21025101457988876, + "flos": 20377865813760.0, + "grad_norm": 2.144806317706846, + "language_loss": 0.8223803, + "learning_rate": 3.668744875505915e-06, + "loss": 0.85095608, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 2.835479497909546 + }, + { + "auxiliary_loss_clip": 0.01523293, + "auxiliary_loss_mlp": 0.0132717, + "balance_loss_clip": 1.17672455, + "balance_loss_mlp": 1.05975974, + "epoch": 0.21031113783255675, + "flos": 25778017369440.0, + "grad_norm": 2.002296014613022, + "language_loss": 0.67783582, + "learning_rate": 3.668530172166741e-06, + "loss": 0.70634043, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 2.925950527191162 + }, + { + "auxiliary_loss_clip": 0.01521613, + "auxiliary_loss_mlp": 0.01321394, + "balance_loss_clip": 1.17529988, + "balance_loss_mlp": 1.05436468, + "epoch": 0.21037126108522472, + "flos": 22020399105120.0, + "grad_norm": 1.8078642765841366, + "language_loss": 0.80750364, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.83593369, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.920484781265259 + }, + { + "auxiliary_loss_clip": 0.01533477, + "auxiliary_loss_mlp": 0.01336122, + "balance_loss_clip": 1.1879766, + "balance_loss_mlp": 1.07252693, + "epoch": 0.21043138433789269, + "flos": 25336567143360.0, + "grad_norm": 3.6122977499880364, + "language_loss": 0.78617227, + "learning_rate": 3.668100575684043e-06, + "loss": 0.81486833, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.8578152656555176 + }, + { + "auxiliary_loss_clip": 0.01525975, + "auxiliary_loss_mlp": 0.0132213, + "balance_loss_clip": 1.18056798, + "balance_loss_mlp": 1.05224001, + "epoch": 0.21049150759056065, + "flos": 25559017987680.0, + "grad_norm": 2.1797244708507755, + "language_loss": 0.74256265, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.77104378, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.888660430908203 + }, + { + "auxiliary_loss_clip": 0.01529283, + "auxiliary_loss_mlp": 0.01319082, + "balance_loss_clip": 1.18334138, + "balance_loss_mlp": 1.05357862, + "epoch": 0.21055163084322862, + "flos": 24497739577440.0, + "grad_norm": 1.561441194059912, + "language_loss": 0.75614697, + "learning_rate": 3.667670726183183e-06, + "loss": 0.78463066, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 2.884209632873535 + }, + { + "auxiliary_loss_clip": 0.01528234, + "auxiliary_loss_mlp": 0.01315287, + "balance_loss_clip": 1.18311191, + "balance_loss_mlp": 1.04787636, + "epoch": 0.21061175409589658, + "flos": 25741416330720.0, + "grad_norm": 2.4735759977389273, + "language_loss": 0.77521133, + "learning_rate": 3.667455706571316e-06, + "loss": 0.80364656, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.826450824737549 + }, + { + "auxiliary_loss_clip": 0.01528485, + "auxiliary_loss_mlp": 0.01322865, + "balance_loss_clip": 1.18277407, + "balance_loss_mlp": 1.05049586, + "epoch": 0.21067187734856455, + "flos": 18991160511840.0, + "grad_norm": 2.322204690947992, + "language_loss": 0.78942972, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.81794322, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.8713035583496094 + }, + { + "auxiliary_loss_clip": 0.01518672, + "auxiliary_loss_mlp": 0.01322115, + "balance_loss_clip": 1.17375958, + "balance_loss_mlp": 1.05165291, + "epoch": 0.21073200060123254, + "flos": 24683703167520.0, + "grad_norm": 3.105393782465756, + "language_loss": 0.76834476, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79675257, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.9915435314178467 + }, + { + "auxiliary_loss_clip": 0.01526629, + "auxiliary_loss_mlp": 0.01310422, + "balance_loss_clip": 1.18121135, + "balance_loss_mlp": 1.04510951, + "epoch": 0.2107921238539005, + "flos": 28551921039360.0, + "grad_norm": 1.8528277293650297, + "language_loss": 0.64360583, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.67197633, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.873467445373535 + }, + { + "auxiliary_loss_clip": 0.01532418, + "auxiliary_loss_mlp": 0.01314282, + "balance_loss_clip": 1.18564296, + "balance_loss_mlp": 1.04706264, + "epoch": 0.21085224710656847, + "flos": 25888275623520.0, + "grad_norm": 1.721592602731091, + "language_loss": 0.82221842, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.85068536, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 2.889724016189575 + }, + { + "auxiliary_loss_clip": 0.01535382, + "auxiliary_loss_mlp": 0.01315133, + "balance_loss_clip": 1.19021893, + "balance_loss_mlp": 1.04753184, + "epoch": 0.21091237035923643, + "flos": 14978093539680.0, + "grad_norm": 1.837385777911439, + "language_loss": 0.75876862, + "learning_rate": 3.666379660223824e-06, + "loss": 0.78727376, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 2.848802089691162 + }, + { + "auxiliary_loss_clip": 0.01527353, + "auxiliary_loss_mlp": 0.01320831, + "balance_loss_clip": 1.18213308, + "balance_loss_mlp": 1.05609059, + "epoch": 0.2109724936119044, + "flos": 16364381631840.0, + "grad_norm": 5.027845815916329, + "language_loss": 0.8580572, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.88653904, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 2.8213438987731934 + }, + { + "auxiliary_loss_clip": 0.015199, + "auxiliary_loss_mlp": 0.01313403, + "balance_loss_clip": 1.17478132, + "balance_loss_mlp": 1.04389429, + "epoch": 0.21103261686457236, + "flos": 31505112943200.0, + "grad_norm": 1.7263667267625418, + "language_loss": 0.68188196, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.71021497, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.940093755722046 + }, + { + "auxiliary_loss_clip": 0.01520583, + "auxiliary_loss_mlp": 0.0132008, + "balance_loss_clip": 1.17520857, + "balance_loss_mlp": 1.05133438, + "epoch": 0.21109274011724033, + "flos": 27346514163840.0, + "grad_norm": 1.748617453426404, + "language_loss": 0.7237007, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.75210726, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.9726593494415283 + }, + { + "auxiliary_loss_clip": 0.01531104, + "auxiliary_loss_mlp": 0.01328164, + "balance_loss_clip": 1.18534076, + "balance_loss_mlp": 1.05808306, + "epoch": 0.21115286336990832, + "flos": 17822240890560.0, + "grad_norm": 3.230168697943477, + "language_loss": 0.69258136, + "learning_rate": 3.665517685689794e-06, + "loss": 0.721174, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 2.8434364795684814 + }, + { + "auxiliary_loss_clip": 0.01528682, + "auxiliary_loss_mlp": 0.01329985, + "balance_loss_clip": 1.18361771, + "balance_loss_mlp": 1.06448174, + "epoch": 0.2112129866225763, + "flos": 27200413434240.0, + "grad_norm": 3.658800532829986, + "language_loss": 0.73715377, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.76574045, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 2.9180521965026855 + }, + { + "auxiliary_loss_clip": 0.01528781, + "auxiliary_loss_mlp": 0.01319306, + "balance_loss_clip": 1.18547022, + "balance_loss_mlp": 1.0570457, + "epoch": 0.21127310987524425, + "flos": 23733479502720.0, + "grad_norm": 1.6903952193552854, + "language_loss": 0.74374634, + "learning_rate": 3.665086319450502e-06, + "loss": 0.77222717, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 2.9260005950927734 + }, + { + "auxiliary_loss_clip": 0.0152155, + "auxiliary_loss_mlp": 0.01324949, + "balance_loss_clip": 1.17782068, + "balance_loss_mlp": 1.05906439, + "epoch": 0.21133323312791222, + "flos": 18334465791840.0, + "grad_norm": 1.9664672319854353, + "language_loss": 0.76994181, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.79840678, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.8950934410095215 + }, + { + "auxiliary_loss_clip": 0.01525054, + "auxiliary_loss_mlp": 0.01322263, + "balance_loss_clip": 1.18128586, + "balance_loss_mlp": 1.05580676, + "epoch": 0.21139335638058018, + "flos": 17933143923360.0, + "grad_norm": 2.1343829147169577, + "language_loss": 0.68679035, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.71526355, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 4.49897313117981 + }, + { + "auxiliary_loss_clip": 0.01521225, + "auxiliary_loss_mlp": 0.01328291, + "balance_loss_clip": 1.17683709, + "balance_loss_mlp": 1.05954552, + "epoch": 0.21145347963324815, + "flos": 24574431045600.0, + "grad_norm": 1.9021179957696868, + "language_loss": 0.85054159, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87903678, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.955000877380371 + }, + { + "auxiliary_loss_clip": 0.0152311, + "auxiliary_loss_mlp": 0.01316961, + "balance_loss_clip": 1.1781466, + "balance_loss_mlp": 1.05241203, + "epoch": 0.21151360288591614, + "flos": 35848916749440.0, + "grad_norm": 1.7664606467150024, + "language_loss": 0.62722909, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65562975, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 2.9589176177978516 + }, + { + "auxiliary_loss_clip": 0.01527838, + "auxiliary_loss_mlp": 0.01325549, + "balance_loss_clip": 1.18440199, + "balance_loss_mlp": 1.06176269, + "epoch": 0.2115737261385841, + "flos": 24643726522560.0, + "grad_norm": 2.1315012113968246, + "language_loss": 0.89685869, + "learning_rate": 3.664006799041303e-06, + "loss": 0.92539257, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.962531089782715 + }, + { + "auxiliary_loss_clip": 0.01528923, + "auxiliary_loss_mlp": 0.01326014, + "balance_loss_clip": 1.18356252, + "balance_loss_mlp": 1.06241894, + "epoch": 0.21163384939125207, + "flos": 25229153501280.0, + "grad_norm": 2.5194569739712667, + "language_loss": 0.81389856, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.84244794, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.8942317962646484 + }, + { + "auxiliary_loss_clip": 0.0152864, + "auxiliary_loss_mlp": 0.01321176, + "balance_loss_clip": 1.18440831, + "balance_loss_mlp": 1.05681705, + "epoch": 0.21169397264392004, + "flos": 26069915403360.0, + "grad_norm": 1.642274640472085, + "language_loss": 0.75899446, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78749263, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 4.431105136871338 + }, + { + "auxiliary_loss_clip": 0.01521853, + "auxiliary_loss_mlp": 0.01312475, + "balance_loss_clip": 1.17827392, + "balance_loss_mlp": 1.04640007, + "epoch": 0.211754095896588, + "flos": 23110275712320.0, + "grad_norm": 2.1886979202519217, + "language_loss": 0.75966728, + "learning_rate": 3.663358329538626e-06, + "loss": 0.7880106, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.946143627166748 + }, + { + "auxiliary_loss_clip": 0.0152253, + "auxiliary_loss_mlp": 0.01317234, + "balance_loss_clip": 1.17765164, + "balance_loss_mlp": 1.04886973, + "epoch": 0.21181421914925597, + "flos": 27924507223200.0, + "grad_norm": 2.1504886965494374, + "language_loss": 0.70489514, + "learning_rate": 3.663142046877374e-06, + "loss": 0.7332927, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.9199225902557373 + }, + { + "auxiliary_loss_clip": 0.01523704, + "auxiliary_loss_mlp": 0.01319395, + "balance_loss_clip": 1.1801306, + "balance_loss_mlp": 1.05331993, + "epoch": 0.21187434240192393, + "flos": 17130538114560.0, + "grad_norm": 2.406997588095578, + "language_loss": 0.77224672, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.80067778, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.8444840908050537 + }, + { + "auxiliary_loss_clip": 0.01520026, + "auxiliary_loss_mlp": 0.0133071, + "balance_loss_clip": 1.17548299, + "balance_loss_mlp": 1.06444407, + "epoch": 0.21193446565459192, + "flos": 22349580884640.0, + "grad_norm": 2.9094646755341436, + "language_loss": 0.81940109, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.84790838, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 4.462768316268921 + }, + { + "auxiliary_loss_clip": 0.01523134, + "auxiliary_loss_mlp": 0.01319965, + "balance_loss_clip": 1.17938972, + "balance_loss_mlp": 1.05141068, + "epoch": 0.2119945889072599, + "flos": 27201816776160.0, + "grad_norm": 1.8751493846235472, + "language_loss": 0.75184536, + "learning_rate": 3.662492820527356e-06, + "loss": 0.78027642, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 4.456703424453735 + }, + { + "auxiliary_loss_clip": 0.01523208, + "auxiliary_loss_mlp": 0.01311658, + "balance_loss_clip": 1.17954612, + "balance_loss_mlp": 1.0410049, + "epoch": 0.21205471215992786, + "flos": 20993749469280.0, + "grad_norm": 1.8467020305262414, + "language_loss": 0.76852739, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79687607, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.8520383834838867 + }, + { + "auxiliary_loss_clip": 0.01524978, + "auxiliary_loss_mlp": 0.01321274, + "balance_loss_clip": 1.17997444, + "balance_loss_mlp": 1.05157518, + "epoch": 0.21211483541259582, + "flos": 20779794532800.0, + "grad_norm": 1.8189620607502095, + "language_loss": 0.7850132, + "learning_rate": 3.662059687737528e-06, + "loss": 0.81347573, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 2.9131603240966797 + }, + { + "auxiliary_loss_clip": 0.0151895, + "auxiliary_loss_mlp": 0.01320277, + "balance_loss_clip": 1.17454791, + "balance_loss_mlp": 1.05401158, + "epoch": 0.21217495866526379, + "flos": 18992032859520.0, + "grad_norm": 1.9179099290172101, + "language_loss": 0.82201898, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.85041124, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 2.9133968353271484 + }, + { + "auxiliary_loss_clip": 0.01519425, + "auxiliary_loss_mlp": 0.01317161, + "balance_loss_clip": 1.17562914, + "balance_loss_mlp": 1.04650784, + "epoch": 0.21223508191793175, + "flos": 20669384566080.0, + "grad_norm": 2.8216855401528544, + "language_loss": 0.76734501, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.79571092, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 2.895073652267456 + }, + { + "auxiliary_loss_clip": 0.01516291, + "auxiliary_loss_mlp": 0.01307098, + "balance_loss_clip": 1.17277431, + "balance_loss_mlp": 1.03949738, + "epoch": 0.21229520517059972, + "flos": 21618622098720.0, + "grad_norm": 2.4766674327702964, + "language_loss": 0.82921433, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85744822, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 2.892127275466919 + }, + { + "auxiliary_loss_clip": 0.01518685, + "auxiliary_loss_mlp": 0.01310764, + "balance_loss_clip": 1.17536235, + "balance_loss_mlp": 1.04697728, + "epoch": 0.2123553284232677, + "flos": 13992785962560.0, + "grad_norm": 2.712675315652968, + "language_loss": 0.73296487, + "learning_rate": 3.661192665917977e-06, + "loss": 0.76125938, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 2.846606731414795 + }, + { + "auxiliary_loss_clip": 0.0151822, + "auxiliary_loss_mlp": 0.01310582, + "balance_loss_clip": 1.17472589, + "balance_loss_mlp": 1.04107356, + "epoch": 0.21241545167593567, + "flos": 18298775028960.0, + "grad_norm": 2.488433177974562, + "language_loss": 0.7398892, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76817727, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 2.973177433013916 + }, + { + "auxiliary_loss_clip": 0.0152669, + "auxiliary_loss_mlp": 0.01312118, + "balance_loss_clip": 1.18326974, + "balance_loss_mlp": 1.03936732, + "epoch": 0.21247557492860364, + "flos": 34715915460000.0, + "grad_norm": 2.1567118437125257, + "language_loss": 0.71464962, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.7430377, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 3.065920352935791 + }, + { + "auxiliary_loss_clip": 0.01530051, + "auxiliary_loss_mlp": 0.01317299, + "balance_loss_clip": 1.18691194, + "balance_loss_mlp": 1.0489347, + "epoch": 0.2125356981812716, + "flos": 22055976083520.0, + "grad_norm": 2.4996297466399113, + "language_loss": 0.72133929, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.74981278, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 2.863579750061035 + }, + { + "auxiliary_loss_clip": 0.01518798, + "auxiliary_loss_mlp": 0.01308422, + "balance_loss_clip": 1.17474222, + "balance_loss_mlp": 1.04063034, + "epoch": 0.21259582143393957, + "flos": 28550859050880.0, + "grad_norm": 2.5347940668326023, + "language_loss": 0.709638, + "learning_rate": 3.660324636216996e-06, + "loss": 0.73791015, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 2.9385616779327393 + }, + { + "auxiliary_loss_clip": 0.01518733, + "auxiliary_loss_mlp": 0.01314986, + "balance_loss_clip": 1.17602313, + "balance_loss_mlp": 1.04681277, + "epoch": 0.21265594468660753, + "flos": 20122986028320.0, + "grad_norm": 1.9318099379586895, + "language_loss": 0.88149107, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90982831, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 2.8267176151275635 + }, + { + "auxiliary_loss_clip": 0.01520338, + "auxiliary_loss_mlp": 0.01317784, + "balance_loss_clip": 1.17698121, + "balance_loss_mlp": 1.0530442, + "epoch": 0.21271606793927553, + "flos": 23078415693600.0, + "grad_norm": 2.8416677297656587, + "language_loss": 0.80517948, + "learning_rate": 3.659890243575524e-06, + "loss": 0.83356065, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.9330434799194336 + }, + { + "auxiliary_loss_clip": 0.01516882, + "auxiliary_loss_mlp": 0.01318067, + "balance_loss_clip": 1.17397046, + "balance_loss_mlp": 1.05389988, + "epoch": 0.2127761911919435, + "flos": 26390184065280.0, + "grad_norm": 2.298387232881489, + "language_loss": 0.87567931, + "learning_rate": 3.659672952835863e-06, + "loss": 0.90402877, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.899073362350464 + }, + { + "auxiliary_loss_clip": 0.01522789, + "auxiliary_loss_mlp": 0.01326035, + "balance_loss_clip": 1.1796478, + "balance_loss_mlp": 1.05995941, + "epoch": 0.21283631444461146, + "flos": 20230247957760.0, + "grad_norm": 3.8880681536306727, + "language_loss": 0.58430082, + "learning_rate": 3.659455599161237e-06, + "loss": 0.61278903, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.857584238052368 + }, + { + "auxiliary_loss_clip": 0.01525604, + "auxiliary_loss_mlp": 0.01340069, + "balance_loss_clip": 1.18226314, + "balance_loss_mlp": 1.07532883, + "epoch": 0.21289643769727942, + "flos": 13518717154560.0, + "grad_norm": 2.047404326840527, + "language_loss": 0.75902545, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78768218, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 2.9449572563171387 + }, + { + "auxiliary_loss_clip": 0.01527323, + "auxiliary_loss_mlp": 0.01317918, + "balance_loss_clip": 1.18367338, + "balance_loss_mlp": 1.05489421, + "epoch": 0.2129565609499474, + "flos": 24829348759200.0, + "grad_norm": 2.064644864556968, + "language_loss": 0.69319737, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.72164977, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.9169070720672607 + }, + { + "auxiliary_loss_clip": 0.01518159, + "auxiliary_loss_mlp": 0.01314216, + "balance_loss_clip": 1.17595434, + "balance_loss_mlp": 1.05081141, + "epoch": 0.21301668420261535, + "flos": 23661491126400.0, + "grad_norm": 1.9665561738206359, + "language_loss": 0.75815308, + "learning_rate": 3.658803160610004e-06, + "loss": 0.78647685, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.8491291999816895 + }, + { + "auxiliary_loss_clip": 0.01527028, + "auxiliary_loss_mlp": 0.01322229, + "balance_loss_clip": 1.18395281, + "balance_loss_mlp": 1.05748904, + "epoch": 0.21307680745528332, + "flos": 16364685057120.0, + "grad_norm": 1.9784583255601036, + "language_loss": 0.67213333, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.7006259, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.8563106060028076 + }, + { + "auxiliary_loss_clip": 0.01524574, + "auxiliary_loss_mlp": 0.01322649, + "balance_loss_clip": 1.18136072, + "balance_loss_mlp": 1.05485761, + "epoch": 0.2131369307079513, + "flos": 19101077412480.0, + "grad_norm": 1.825557915405483, + "language_loss": 0.71286494, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.74133718, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.9767348766326904 + }, + { + "auxiliary_loss_clip": 0.01530051, + "auxiliary_loss_mlp": 0.01308093, + "balance_loss_clip": 1.18752289, + "balance_loss_mlp": 1.0420177, + "epoch": 0.21319705396061928, + "flos": 30374880409440.0, + "grad_norm": 4.310836719500164, + "language_loss": 0.72513068, + "learning_rate": 3.658150155940946e-06, + "loss": 0.75351208, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 2.9918429851531982 + }, + { + "auxiliary_loss_clip": 0.01528288, + "auxiliary_loss_mlp": 0.01314279, + "balance_loss_clip": 1.18616247, + "balance_loss_mlp": 1.04667783, + "epoch": 0.21325717721328724, + "flos": 21758009544000.0, + "grad_norm": 2.1292596161943798, + "language_loss": 0.8020485, + "learning_rate": 3.657932361952479e-06, + "loss": 0.83047414, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.8259308338165283 + }, + { + "auxiliary_loss_clip": 0.01522221, + "auxiliary_loss_mlp": 0.01313617, + "balance_loss_clip": 1.17895508, + "balance_loss_mlp": 1.04601634, + "epoch": 0.2133173004659552, + "flos": 28733409106560.0, + "grad_norm": 3.6305738865440924, + "language_loss": 0.74582237, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.77418077, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 3.017038345336914 + }, + { + "auxiliary_loss_clip": 0.01521137, + "auxiliary_loss_mlp": 0.0132051, + "balance_loss_clip": 1.17652702, + "balance_loss_mlp": 1.05519736, + "epoch": 0.21337742371862317, + "flos": 16838981434080.0, + "grad_norm": 2.4299523957023874, + "language_loss": 0.749488, + "learning_rate": 3.657496585376922e-06, + "loss": 0.77790451, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.831346035003662 + }, + { + "auxiliary_loss_clip": 0.01518605, + "auxiliary_loss_mlp": 0.01317947, + "balance_loss_clip": 1.17562866, + "balance_loss_mlp": 1.04614985, + "epoch": 0.21343754697129114, + "flos": 24427192471200.0, + "grad_norm": 3.04722085104099, + "language_loss": 0.80609596, + "learning_rate": 3.657278602806357e-06, + "loss": 0.83446151, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.8739476203918457 + }, + { + "auxiliary_loss_clip": 0.01526381, + "auxiliary_loss_mlp": 0.01304323, + "balance_loss_clip": 1.18445194, + "balance_loss_mlp": 1.03920174, + "epoch": 0.21349767022395913, + "flos": 19279645011360.0, + "grad_norm": 1.631874770667364, + "language_loss": 0.87851083, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90681785, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 2.9282424449920654 + }, + { + "auxiliary_loss_clip": 0.01523539, + "auxiliary_loss_mlp": 0.01310353, + "balance_loss_clip": 1.1801064, + "balance_loss_mlp": 1.04027212, + "epoch": 0.2135577934766271, + "flos": 17349461640000.0, + "grad_norm": 1.9542132527839968, + "language_loss": 0.83441031, + "learning_rate": 3.656842449140983e-06, + "loss": 0.86274922, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.8470470905303955 + }, + { + "auxiliary_loss_clip": 0.01529138, + "auxiliary_loss_mlp": 0.01331791, + "balance_loss_clip": 1.1868701, + "balance_loss_mlp": 1.06762278, + "epoch": 0.21361791672929506, + "flos": 24059285676000.0, + "grad_norm": 1.853120066338387, + "language_loss": 0.76967233, + "learning_rate": 3.656624278062713e-06, + "loss": 0.79828167, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.865992307662964 + }, + { + "auxiliary_loss_clip": 0.0153134, + "auxiliary_loss_mlp": 0.01321252, + "balance_loss_clip": 1.18743396, + "balance_loss_mlp": 1.05651212, + "epoch": 0.21367803998196302, + "flos": 22164413785920.0, + "grad_norm": 2.693294623373154, + "language_loss": 0.72867674, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.75720263, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 2.895451068878174 + }, + { + "auxiliary_loss_clip": 0.01525304, + "auxiliary_loss_mlp": 0.01335011, + "balance_loss_clip": 1.18272853, + "balance_loss_mlp": 1.07217824, + "epoch": 0.213738163234631, + "flos": 20888990798400.0, + "grad_norm": 1.8733816294184515, + "language_loss": 0.67455685, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.70315999, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 4.58592414855957 + }, + { + "auxiliary_loss_clip": 0.01528158, + "auxiliary_loss_mlp": 0.01308186, + "balance_loss_clip": 1.1846838, + "balance_loss_mlp": 1.03276443, + "epoch": 0.21379828648729896, + "flos": 28405668597120.0, + "grad_norm": 1.7878431313237513, + "language_loss": 0.65224016, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.68060362, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.956282138824463 + }, + { + "auxiliary_loss_clip": 0.01526418, + "auxiliary_loss_mlp": 0.0132446, + "balance_loss_clip": 1.18184304, + "balance_loss_mlp": 1.05991054, + "epoch": 0.21385840973996692, + "flos": 25481795525280.0, + "grad_norm": 1.7555230545213578, + "language_loss": 0.72898108, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.75748986, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.854196548461914 + }, + { + "auxiliary_loss_clip": 0.01525778, + "auxiliary_loss_mlp": 0.01321842, + "balance_loss_clip": 1.18216181, + "balance_loss_mlp": 1.0513804, + "epoch": 0.2139185329926349, + "flos": 28076790242880.0, + "grad_norm": 1.6534178724238464, + "language_loss": 0.67173994, + "learning_rate": 3.655532480546528e-06, + "loss": 0.70021611, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 2.9599809646606445 + }, + { + "auxiliary_loss_clip": 0.01515974, + "auxiliary_loss_mlp": 0.01305673, + "balance_loss_clip": 1.17296243, + "balance_loss_mlp": 1.03101468, + "epoch": 0.21397865624530288, + "flos": 19610685270720.0, + "grad_norm": 1.8720104363990155, + "language_loss": 0.79879868, + "learning_rate": 3.655313932676286e-06, + "loss": 0.82701516, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 4.512813091278076 + }, + { + "auxiliary_loss_clip": 0.0152317, + "auxiliary_loss_mlp": 0.01315919, + "balance_loss_clip": 1.17962813, + "balance_loss_mlp": 1.04984438, + "epoch": 0.21403877949797084, + "flos": 24683816952000.0, + "grad_norm": 1.673223622475695, + "language_loss": 0.6781444, + "learning_rate": 3.655095322036373e-06, + "loss": 0.70653534, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.953395366668701 + }, + { + "auxiliary_loss_clip": 0.01525476, + "auxiliary_loss_mlp": 0.01319033, + "balance_loss_clip": 1.18203366, + "balance_loss_mlp": 1.04952478, + "epoch": 0.2140989027506388, + "flos": 19862985941280.0, + "grad_norm": 2.4479026272399733, + "language_loss": 0.73516184, + "learning_rate": 3.65487664863508e-06, + "loss": 0.76360691, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 2.891702890396118 + }, + { + "auxiliary_loss_clip": 0.01528448, + "auxiliary_loss_mlp": 0.01316896, + "balance_loss_clip": 1.18457961, + "balance_loss_mlp": 1.04281044, + "epoch": 0.21415902600330677, + "flos": 19137337097760.0, + "grad_norm": 5.3686842473879555, + "language_loss": 0.7786603, + "learning_rate": 3.654657912480698e-06, + "loss": 0.80711377, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 4.211057186126709 + }, + { + "auxiliary_loss_clip": 0.01527649, + "auxiliary_loss_mlp": 0.01314426, + "balance_loss_clip": 1.18369198, + "balance_loss_mlp": 1.04415512, + "epoch": 0.21421914925597474, + "flos": 22274785824480.0, + "grad_norm": 1.6160114893104267, + "language_loss": 0.84623289, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.87465358, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.8665871620178223 + }, + { + "auxiliary_loss_clip": 0.01529436, + "auxiliary_loss_mlp": 0.01313499, + "balance_loss_clip": 1.18592083, + "balance_loss_mlp": 1.04093933, + "epoch": 0.2142792725086427, + "flos": 33877922313600.0, + "grad_norm": 1.65338560762098, + "language_loss": 0.76859641, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.79702574, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 3.0092973709106445 + }, + { + "auxiliary_loss_clip": 0.0152901, + "auxiliary_loss_mlp": 0.01327378, + "balance_loss_clip": 1.18552864, + "balance_loss_mlp": 1.05596244, + "epoch": 0.2143393957613107, + "flos": 19861999809120.0, + "grad_norm": 2.0345303168175377, + "language_loss": 0.88878447, + "learning_rate": 3.654001327581981e-06, + "loss": 0.91734838, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 4.424698352813721 + }, + { + "auxiliary_loss_clip": 0.0168431, + "auxiliary_loss_mlp": 0.01358681, + "balance_loss_clip": 1.35478354, + "balance_loss_mlp": 1.13895416, + "epoch": 0.21439951901397866, + "flos": 68536527595200.0, + "grad_norm": 0.8609247667667902, + "language_loss": 0.52257049, + "learning_rate": 3.653782340498215e-06, + "loss": 0.55300039, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.304781913757324 + }, + { + "auxiliary_loss_clip": 0.01532943, + "auxiliary_loss_mlp": 0.01316799, + "balance_loss_clip": 1.18721271, + "balance_loss_mlp": 1.05377531, + "epoch": 0.21445964226664663, + "flos": 19685063121120.0, + "grad_norm": 2.00584597266793, + "language_loss": 0.67640215, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.70489955, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.9051854610443115 + }, + { + "auxiliary_loss_clip": 0.0152881, + "auxiliary_loss_mlp": 0.01307138, + "balance_loss_clip": 1.18464696, + "balance_loss_mlp": 1.04010892, + "epoch": 0.2145197655193146, + "flos": 31110200933760.0, + "grad_norm": 1.7946852625939438, + "language_loss": 0.74377209, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.77213156, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 2.986182451248169 + }, + { + "auxiliary_loss_clip": 0.0152978, + "auxiliary_loss_mlp": 0.0131314, + "balance_loss_clip": 1.18545771, + "balance_loss_mlp": 1.04286838, + "epoch": 0.21457988877198256, + "flos": 20122872243840.0, + "grad_norm": 1.6437499159931968, + "language_loss": 0.77631378, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.80474299, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 2.82914400100708 + }, + { + "auxiliary_loss_clip": 0.01527846, + "auxiliary_loss_mlp": 0.01317933, + "balance_loss_clip": 1.18361712, + "balance_loss_mlp": 1.04193997, + "epoch": 0.21464001202465052, + "flos": 18590028284160.0, + "grad_norm": 2.7632453753240016, + "language_loss": 0.70768738, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.73614514, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 2.841205596923828 + }, + { + "auxiliary_loss_clip": 0.01520609, + "auxiliary_loss_mlp": 0.01322359, + "balance_loss_clip": 1.17658675, + "balance_loss_mlp": 1.04674757, + "epoch": 0.21470013527731852, + "flos": 21837318055200.0, + "grad_norm": 2.1988794124408217, + "language_loss": 0.78578162, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.81421125, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 2.8530991077423096 + }, + { + "auxiliary_loss_clip": 0.0152974, + "auxiliary_loss_mlp": 0.01325475, + "balance_loss_clip": 1.18534303, + "balance_loss_mlp": 1.05310535, + "epoch": 0.21476025852998648, + "flos": 17605137916800.0, + "grad_norm": 3.2531583871341803, + "language_loss": 0.82946289, + "learning_rate": 3.652467101342991e-06, + "loss": 0.858015, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.866586208343506 + }, + { + "auxiliary_loss_clip": 0.01526188, + "auxiliary_loss_mlp": 0.01314806, + "balance_loss_clip": 1.18207943, + "balance_loss_mlp": 1.03900301, + "epoch": 0.21482038178265445, + "flos": 24830524532160.0, + "grad_norm": 2.50183791140166, + "language_loss": 0.64906919, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67747915, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 2.8616762161254883 + }, + { + "auxiliary_loss_clip": 0.01526847, + "auxiliary_loss_mlp": 0.0130848, + "balance_loss_clip": 1.18342686, + "balance_loss_mlp": 1.03935289, + "epoch": 0.2148805050353224, + "flos": 23260472683200.0, + "grad_norm": 2.1363368672805807, + "language_loss": 0.76088107, + "learning_rate": 3.652028186908807e-06, + "loss": 0.78923428, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.855262517929077 + }, + { + "auxiliary_loss_clip": 0.01522085, + "auxiliary_loss_mlp": 0.01310647, + "balance_loss_clip": 1.1778717, + "balance_loss_mlp": 1.04285574, + "epoch": 0.21494062828799038, + "flos": 21323045033280.0, + "grad_norm": 2.460760314682646, + "language_loss": 0.72559607, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.75392336, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 2.8877294063568115 + }, + { + "auxiliary_loss_clip": 0.01534053, + "auxiliary_loss_mlp": 0.01309984, + "balance_loss_clip": 1.18930674, + "balance_loss_mlp": 1.03856814, + "epoch": 0.21500075154065834, + "flos": 18845325279360.0, + "grad_norm": 1.88400660762353, + "language_loss": 0.6860128, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.71445316, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.8509204387664795 + }, + { + "auxiliary_loss_clip": 0.01524365, + "auxiliary_loss_mlp": 0.01311161, + "balance_loss_clip": 1.18119061, + "balance_loss_mlp": 1.03612137, + "epoch": 0.2150608747933263, + "flos": 18443965482720.0, + "grad_norm": 2.2135067351652085, + "language_loss": 0.88817871, + "learning_rate": 3.651369345440292e-06, + "loss": 0.91653401, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.864824056625366 + }, + { + "auxiliary_loss_clip": 0.01661447, + "auxiliary_loss_mlp": 0.01266418, + "balance_loss_clip": 1.33165956, + "balance_loss_mlp": 1.03295898, + "epoch": 0.2151209980459943, + "flos": 66604675384800.0, + "grad_norm": 0.8172916984457088, + "language_loss": 0.56197709, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.59125578, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 3.3102922439575195 + }, + { + "auxiliary_loss_clip": 0.01524708, + "auxiliary_loss_mlp": 0.01324609, + "balance_loss_clip": 1.18013716, + "balance_loss_mlp": 1.05967796, + "epoch": 0.21518112129866226, + "flos": 21577811034240.0, + "grad_norm": 1.8527833728896617, + "language_loss": 0.88755548, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.91604865, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.8355872631073 + }, + { + "auxiliary_loss_clip": 0.01518702, + "auxiliary_loss_mlp": 0.01324647, + "balance_loss_clip": 1.17407727, + "balance_loss_mlp": 1.05685544, + "epoch": 0.21524124455133023, + "flos": 20049708094560.0, + "grad_norm": 2.0183419253128583, + "language_loss": 0.78123057, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80966401, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.834886312484741 + }, + { + "auxiliary_loss_clip": 0.01514667, + "auxiliary_loss_mlp": 0.01330189, + "balance_loss_clip": 1.17061281, + "balance_loss_mlp": 1.0656395, + "epoch": 0.2153013678039982, + "flos": 23954109795360.0, + "grad_norm": 2.609531329978983, + "language_loss": 0.72420752, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.7526561, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 2.9712696075439453 + }, + { + "auxiliary_loss_clip": 0.01521852, + "auxiliary_loss_mlp": 0.01344211, + "balance_loss_clip": 1.17617869, + "balance_loss_mlp": 1.0823319, + "epoch": 0.21536149105666616, + "flos": 20596941051840.0, + "grad_norm": 2.1307182939875995, + "language_loss": 0.71433407, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.74299467, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.8665013313293457 + }, + { + "auxiliary_loss_clip": 0.01522527, + "auxiliary_loss_mlp": 0.01344761, + "balance_loss_clip": 1.17812359, + "balance_loss_mlp": 1.08078372, + "epoch": 0.21542161430933413, + "flos": 12861719009280.0, + "grad_norm": 2.560605448375526, + "language_loss": 0.83644581, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86511874, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.7160966396331787 + }, + { + "auxiliary_loss_clip": 0.01523142, + "auxiliary_loss_mlp": 0.01325073, + "balance_loss_clip": 1.17812395, + "balance_loss_mlp": 1.05957031, + "epoch": 0.21548173756200212, + "flos": 26106326801280.0, + "grad_norm": 2.254698243973252, + "language_loss": 0.82891876, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85740089, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 2.809648275375366 + }, + { + "auxiliary_loss_clip": 0.01526139, + "auxiliary_loss_mlp": 0.01335456, + "balance_loss_clip": 1.18219471, + "balance_loss_mlp": 1.07338583, + "epoch": 0.21554186081467008, + "flos": 22165968840480.0, + "grad_norm": 2.1456502399433566, + "language_loss": 0.90381551, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.93243146, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 2.8516809940338135 + }, + { + "auxiliary_loss_clip": 0.01525727, + "auxiliary_loss_mlp": 0.01334141, + "balance_loss_clip": 1.18163812, + "balance_loss_mlp": 1.07111776, + "epoch": 0.21560198406733805, + "flos": 22968991859040.0, + "grad_norm": 2.043077576733036, + "language_loss": 0.74844587, + "learning_rate": 3.649389440450277e-06, + "loss": 0.77704459, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 2.8051953315734863 + }, + { + "auxiliary_loss_clip": 0.01519134, + "auxiliary_loss_mlp": 0.01330986, + "balance_loss_clip": 1.17498326, + "balance_loss_mlp": 1.06662714, + "epoch": 0.215662107320006, + "flos": 22786290090720.0, + "grad_norm": 1.7822866789571397, + "language_loss": 0.83020663, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85870779, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.8995134830474854 + }, + { + "auxiliary_loss_clip": 0.01520306, + "auxiliary_loss_mlp": 0.0132365, + "balance_loss_clip": 1.17613077, + "balance_loss_mlp": 1.05662084, + "epoch": 0.21572223057267398, + "flos": 30886536388320.0, + "grad_norm": 1.903130691527552, + "language_loss": 0.75925034, + "learning_rate": 3.648948773354224e-06, + "loss": 0.78768992, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.899031639099121 + }, + { + "auxiliary_loss_clip": 0.01518251, + "auxiliary_loss_mlp": 0.01320076, + "balance_loss_clip": 1.17440557, + "balance_loss_mlp": 1.05285609, + "epoch": 0.21578235382534194, + "flos": 26913142635840.0, + "grad_norm": 1.8699797533763387, + "language_loss": 0.81092584, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83930916, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 2.9646739959716797 + }, + { + "auxiliary_loss_clip": 0.01522975, + "auxiliary_loss_mlp": 0.01325995, + "balance_loss_clip": 1.17930567, + "balance_loss_mlp": 1.05476975, + "epoch": 0.2158424770780099, + "flos": 24428026890720.0, + "grad_norm": 2.718040539042682, + "language_loss": 0.72700274, + "learning_rate": 3.648507856144961e-06, + "loss": 0.75549245, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 2.877183437347412 + }, + { + "auxiliary_loss_clip": 0.01517266, + "auxiliary_loss_mlp": 0.01323773, + "balance_loss_clip": 1.17173898, + "balance_loss_mlp": 1.05636299, + "epoch": 0.2159026003306779, + "flos": 23952175459200.0, + "grad_norm": 2.7379370873250446, + "language_loss": 0.8388952, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86730564, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.9326939582824707 + }, + { + "auxiliary_loss_clip": 0.01520809, + "auxiliary_loss_mlp": 0.01327738, + "balance_loss_clip": 1.17685771, + "balance_loss_mlp": 1.05784845, + "epoch": 0.21596272358334587, + "flos": 30043385012160.0, + "grad_norm": 1.809048697022114, + "language_loss": 0.69443601, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.72292149, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 4.584801912307739 + }, + { + "auxiliary_loss_clip": 0.01520533, + "auxiliary_loss_mlp": 0.01312188, + "balance_loss_clip": 1.1769731, + "balance_loss_mlp": 1.04077196, + "epoch": 0.21602284683601383, + "flos": 20378320951680.0, + "grad_norm": 2.416278726310595, + "language_loss": 0.84017515, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86850238, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.836700201034546 + }, + { + "auxiliary_loss_clip": 0.01518976, + "auxiliary_loss_mlp": 0.01339618, + "balance_loss_clip": 1.17523098, + "balance_loss_mlp": 1.07449687, + "epoch": 0.2160829700886818, + "flos": 20779453179360.0, + "grad_norm": 2.5424794105803623, + "language_loss": 0.75319672, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.78178269, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 2.8666880130767822 + }, + { + "auxiliary_loss_clip": 0.01517815, + "auxiliary_loss_mlp": 0.01320567, + "balance_loss_clip": 1.17489123, + "balance_loss_mlp": 1.05601764, + "epoch": 0.21614309334134976, + "flos": 22311955785600.0, + "grad_norm": 2.617336255230559, + "language_loss": 0.80687958, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.83526343, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.911928176879883 + }, + { + "auxiliary_loss_clip": 0.01519426, + "auxiliary_loss_mlp": 0.01320904, + "balance_loss_clip": 1.17738748, + "balance_loss_mlp": 1.04910707, + "epoch": 0.21620321659401773, + "flos": 19611557618400.0, + "grad_norm": 3.7992630688282825, + "language_loss": 0.78769004, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81609344, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.8179800510406494 + }, + { + "auxiliary_loss_clip": 0.01522129, + "auxiliary_loss_mlp": 0.01316017, + "balance_loss_clip": 1.1787653, + "balance_loss_mlp": 1.05146813, + "epoch": 0.2162633398466857, + "flos": 18846614836800.0, + "grad_norm": 3.074992937952818, + "language_loss": 0.83521235, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.86359382, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.938516139984131 + }, + { + "auxiliary_loss_clip": 0.015213, + "auxiliary_loss_mlp": 0.01328348, + "balance_loss_clip": 1.17841125, + "balance_loss_mlp": 1.05922091, + "epoch": 0.21632346309935369, + "flos": 18770719860000.0, + "grad_norm": 2.0719485899637533, + "language_loss": 0.80943614, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.83793259, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 4.331009149551392 + }, + { + "auxiliary_loss_clip": 0.01522578, + "auxiliary_loss_mlp": 0.01340032, + "balance_loss_clip": 1.17832446, + "balance_loss_mlp": 1.0728128, + "epoch": 0.21638358635202165, + "flos": 26326653668640.0, + "grad_norm": 1.8804371539676854, + "language_loss": 0.82333601, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.85196209, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 2.9229674339294434 + }, + { + "auxiliary_loss_clip": 0.01520021, + "auxiliary_loss_mlp": 0.01326594, + "balance_loss_clip": 1.1784153, + "balance_loss_mlp": 1.05689514, + "epoch": 0.21644370960468962, + "flos": 20742965925120.0, + "grad_norm": 2.28081504264274, + "language_loss": 0.76487279, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.79333895, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.886855125427246 + }, + { + "auxiliary_loss_clip": 0.01522525, + "auxiliary_loss_mlp": 0.01308588, + "balance_loss_clip": 1.18003631, + "balance_loss_mlp": 1.04270327, + "epoch": 0.21650383285735758, + "flos": 23954564933280.0, + "grad_norm": 2.226407777124237, + "language_loss": 0.80354834, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.83185941, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 4.376439332962036 + }, + { + "auxiliary_loss_clip": 0.01518065, + "auxiliary_loss_mlp": 0.01316655, + "balance_loss_clip": 1.17581391, + "balance_loss_mlp": 1.04562044, + "epoch": 0.21656395611002555, + "flos": 23698357662240.0, + "grad_norm": 2.2145314291956235, + "language_loss": 0.83588827, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.86423552, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.836829662322998 + }, + { + "auxiliary_loss_clip": 0.01518608, + "auxiliary_loss_mlp": 0.01317886, + "balance_loss_clip": 1.17611766, + "balance_loss_mlp": 1.04551649, + "epoch": 0.2166240793626935, + "flos": 20668057080480.0, + "grad_norm": 2.6025675569139115, + "language_loss": 0.74685681, + "learning_rate": 3.645635802397693e-06, + "loss": 0.77522177, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 4.317977666854858 + }, + { + "auxiliary_loss_clip": 0.01529228, + "auxiliary_loss_mlp": 0.01334064, + "balance_loss_clip": 1.18674576, + "balance_loss_mlp": 1.06684422, + "epoch": 0.2166842026153615, + "flos": 21582589982400.0, + "grad_norm": 1.7928229766209405, + "language_loss": 0.74447638, + "learning_rate": 3.645414438132855e-06, + "loss": 0.77310926, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.926887035369873 + }, + { + "auxiliary_loss_clip": 0.01522046, + "auxiliary_loss_mlp": 0.01310301, + "balance_loss_clip": 1.17969978, + "balance_loss_mlp": 1.04727793, + "epoch": 0.21674432586802947, + "flos": 25632333849600.0, + "grad_norm": 1.6867267928219267, + "language_loss": 0.79898381, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82730728, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.9546687602996826 + }, + { + "auxiliary_loss_clip": 0.01631712, + "auxiliary_loss_mlp": 0.0127166, + "balance_loss_clip": 1.30472922, + "balance_loss_mlp": 1.04659271, + "epoch": 0.21680444912069743, + "flos": 56423593607040.0, + "grad_norm": 0.7051032444282375, + "language_loss": 0.58309859, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.61213231, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 3.5275466442108154 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01316153, + "balance_loss_clip": 1.17748475, + "balance_loss_mlp": 1.04778862, + "epoch": 0.2168645723733654, + "flos": 23881249071360.0, + "grad_norm": 2.0652431379727734, + "language_loss": 0.72760767, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75597501, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 2.9812514781951904 + }, + { + "auxiliary_loss_clip": 0.01517875, + "auxiliary_loss_mlp": 0.01323764, + "balance_loss_clip": 1.17478871, + "balance_loss_mlp": 1.0557816, + "epoch": 0.21692469562603336, + "flos": 16948063915200.0, + "grad_norm": 3.9487759229031907, + "language_loss": 0.7719245, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.80034089, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 2.8234376907348633 + }, + { + "auxiliary_loss_clip": 0.01515922, + "auxiliary_loss_mlp": 0.01336692, + "balance_loss_clip": 1.17270374, + "balance_loss_mlp": 1.07157016, + "epoch": 0.21698481887870133, + "flos": 25121398505760.0, + "grad_norm": 1.9230940130332725, + "language_loss": 0.74305022, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.77157634, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 2.9016470909118652 + }, + { + "auxiliary_loss_clip": 0.01520792, + "auxiliary_loss_mlp": 0.01339643, + "balance_loss_clip": 1.17677569, + "balance_loss_mlp": 1.07280529, + "epoch": 0.2170449421313693, + "flos": 17896618740960.0, + "grad_norm": 1.9127062391255685, + "language_loss": 0.88768011, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91628444, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 2.776498317718506 + }, + { + "auxiliary_loss_clip": 0.015171, + "auxiliary_loss_mlp": 0.01317647, + "balance_loss_clip": 1.17337108, + "balance_loss_mlp": 1.05462337, + "epoch": 0.2171050653840373, + "flos": 22640682427200.0, + "grad_norm": 3.9867004230845726, + "language_loss": 0.77864718, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.80699468, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 2.873528003692627 + }, + { + "auxiliary_loss_clip": 0.01520546, + "auxiliary_loss_mlp": 0.01305763, + "balance_loss_clip": 1.17864168, + "balance_loss_mlp": 1.04045069, + "epoch": 0.21716518863670525, + "flos": 19502019999360.0, + "grad_norm": 2.2558511284594664, + "language_loss": 0.63390446, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.66216755, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 2.8205947875976562 + }, + { + "auxiliary_loss_clip": 0.0151278, + "auxiliary_loss_mlp": 0.01311618, + "balance_loss_clip": 1.16910553, + "balance_loss_mlp": 1.04306376, + "epoch": 0.21722531188937322, + "flos": 19794069745920.0, + "grad_norm": 1.9288258144802672, + "language_loss": 0.7612201, + "learning_rate": 3.643419353014776e-06, + "loss": 0.78946412, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.8600330352783203 + }, + { + "auxiliary_loss_clip": 0.01511087, + "auxiliary_loss_mlp": 0.01315137, + "balance_loss_clip": 1.16837525, + "balance_loss_mlp": 1.04906201, + "epoch": 0.21728543514204118, + "flos": 13336091242560.0, + "grad_norm": 2.358735288461731, + "language_loss": 0.70818281, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73644507, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 2.7721948623657227 + }, + { + "auxiliary_loss_clip": 0.01518233, + "auxiliary_loss_mlp": 0.01320242, + "balance_loss_clip": 1.17546844, + "balance_loss_mlp": 1.05416632, + "epoch": 0.21734555839470915, + "flos": 15233693960160.0, + "grad_norm": 2.9098095951825083, + "language_loss": 0.73810071, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.76648545, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.805853843688965 + }, + { + "auxiliary_loss_clip": 0.01516063, + "auxiliary_loss_mlp": 0.01322847, + "balance_loss_clip": 1.17332804, + "balance_loss_mlp": 1.05009651, + "epoch": 0.2174056816473771, + "flos": 19976012951040.0, + "grad_norm": 2.550377751557621, + "language_loss": 0.9010511, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92944014, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.8944251537323 + }, + { + "auxiliary_loss_clip": 0.01520235, + "auxiliary_loss_mlp": 0.01319918, + "balance_loss_clip": 1.17705333, + "balance_loss_mlp": 1.05174446, + "epoch": 0.21746580490004508, + "flos": 16688974104000.0, + "grad_norm": 2.9478762058197434, + "language_loss": 0.81725347, + "learning_rate": 3.642531027869148e-06, + "loss": 0.84565496, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.7673046588897705 + }, + { + "auxiliary_loss_clip": 0.01516156, + "auxiliary_loss_mlp": 0.01318888, + "balance_loss_clip": 1.17264485, + "balance_loss_mlp": 1.04861689, + "epoch": 0.21752592815271307, + "flos": 25774490050560.0, + "grad_norm": 2.879638853813007, + "language_loss": 0.75322163, + "learning_rate": 3.642308790849329e-06, + "loss": 0.7815721, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.91098690032959 + }, + { + "auxiliary_loss_clip": 0.01516707, + "auxiliary_loss_mlp": 0.01320534, + "balance_loss_clip": 1.1729424, + "balance_loss_mlp": 1.05140686, + "epoch": 0.21758605140538104, + "flos": 11256279822720.0, + "grad_norm": 2.174404514109106, + "language_loss": 0.68941426, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71778667, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.846327304840088 + }, + { + "auxiliary_loss_clip": 0.01507441, + "auxiliary_loss_mlp": 0.01306191, + "balance_loss_clip": 1.16447854, + "balance_loss_mlp": 1.03725481, + "epoch": 0.217646174658049, + "flos": 19244523170880.0, + "grad_norm": 2.19829012300304, + "language_loss": 0.78951663, + "learning_rate": 3.641864129988579e-06, + "loss": 0.81765294, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 2.8969380855560303 + }, + { + "auxiliary_loss_clip": 0.01513675, + "auxiliary_loss_mlp": 0.01307923, + "balance_loss_clip": 1.17022538, + "balance_loss_mlp": 1.04299235, + "epoch": 0.21770629791071697, + "flos": 21947272884000.0, + "grad_norm": 1.8180141408559767, + "language_loss": 0.80180013, + "learning_rate": 3.641641706164509e-06, + "loss": 0.83001614, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.8294122219085693 + }, + { + "auxiliary_loss_clip": 0.0150838, + "auxiliary_loss_mlp": 0.01317272, + "balance_loss_clip": 1.1654371, + "balance_loss_mlp": 1.05195963, + "epoch": 0.21776642116338493, + "flos": 24939113947200.0, + "grad_norm": 2.1303465124826793, + "language_loss": 0.87977803, + "learning_rate": 3.641419220089221e-06, + "loss": 0.90803456, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.865994691848755 + }, + { + "auxiliary_loss_clip": 0.01512761, + "auxiliary_loss_mlp": 0.01321725, + "balance_loss_clip": 1.16948986, + "balance_loss_mlp": 1.05317008, + "epoch": 0.2178265444160529, + "flos": 17823227022720.0, + "grad_norm": 2.327120944427436, + "language_loss": 0.7740308, + "learning_rate": 3.641196671771152e-06, + "loss": 0.80237573, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 2.877200126647949 + }, + { + "auxiliary_loss_clip": 0.01516137, + "auxiliary_loss_mlp": 0.01317764, + "balance_loss_clip": 1.17314804, + "balance_loss_mlp": 1.04615736, + "epoch": 0.2178866676687209, + "flos": 17714865176640.0, + "grad_norm": 2.0356039475387697, + "language_loss": 0.84629059, + "learning_rate": 3.640974061218741e-06, + "loss": 0.87462968, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.802942991256714 + }, + { + "auxiliary_loss_clip": 0.01521898, + "auxiliary_loss_mlp": 0.01330838, + "balance_loss_clip": 1.17877293, + "balance_loss_mlp": 1.06571627, + "epoch": 0.21794679092138886, + "flos": 16947722561760.0, + "grad_norm": 3.3854077849181348, + "language_loss": 0.77578342, + "learning_rate": 3.640751388440429e-06, + "loss": 0.80431074, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 2.8185338973999023 + }, + { + "auxiliary_loss_clip": 0.0177701, + "auxiliary_loss_mlp": 0.01368591, + "balance_loss_clip": 1.44323778, + "balance_loss_mlp": 1.16793823, + "epoch": 0.21800691417405682, + "flos": 63724988983680.0, + "grad_norm": 0.8419497653597816, + "language_loss": 0.60660779, + "learning_rate": 3.64052865344466e-06, + "loss": 0.63806379, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 3.4309628009796143 + }, + { + "auxiliary_loss_clip": 0.01508292, + "auxiliary_loss_mlp": 0.01315623, + "balance_loss_clip": 1.16550589, + "balance_loss_mlp": 1.04744983, + "epoch": 0.21806703742672479, + "flos": 21618546242400.0, + "grad_norm": 2.565320112763128, + "language_loss": 0.90878248, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.93702161, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.915863513946533 + }, + { + "auxiliary_loss_clip": 0.01504888, + "auxiliary_loss_mlp": 0.01314748, + "balance_loss_clip": 1.16182983, + "balance_loss_mlp": 1.04790998, + "epoch": 0.21812716067939275, + "flos": 19356943330080.0, + "grad_norm": 2.5465423139640038, + "language_loss": 0.74046969, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.76866615, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 2.8398847579956055 + }, + { + "auxiliary_loss_clip": 0.01508644, + "auxiliary_loss_mlp": 0.01311868, + "balance_loss_clip": 1.16480386, + "balance_loss_mlp": 1.04598379, + "epoch": 0.21818728393206072, + "flos": 23550208812000.0, + "grad_norm": 2.111883570220448, + "language_loss": 0.7775563, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.8057614, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 2.9036953449249268 + }, + { + "auxiliary_loss_clip": 0.01512867, + "auxiliary_loss_mlp": 0.01319379, + "balance_loss_clip": 1.16949832, + "balance_loss_mlp": 1.05349493, + "epoch": 0.21824740718472868, + "flos": 30228210757440.0, + "grad_norm": 1.7985741127554236, + "language_loss": 0.71797955, + "learning_rate": 3.63963709145597e-06, + "loss": 0.74630201, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 4.559979438781738 + }, + { + "auxiliary_loss_clip": 0.01514049, + "auxiliary_loss_mlp": 0.01321426, + "balance_loss_clip": 1.17056715, + "balance_loss_mlp": 1.06202698, + "epoch": 0.21830753043739667, + "flos": 26136745549920.0, + "grad_norm": 2.1680483918415434, + "language_loss": 0.76979816, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.79815292, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 2.9535598754882812 + }, + { + "auxiliary_loss_clip": 0.01502725, + "auxiliary_loss_mlp": 0.01312444, + "balance_loss_clip": 1.15966392, + "balance_loss_mlp": 1.04713142, + "epoch": 0.21836765369006464, + "flos": 21722584278240.0, + "grad_norm": 2.234758852377251, + "language_loss": 0.75659138, + "learning_rate": 3.639190937376594e-06, + "loss": 0.78474307, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 2.824432611465454 + }, + { + "auxiliary_loss_clip": 0.01510342, + "auxiliary_loss_mlp": 0.01331385, + "balance_loss_clip": 1.16697657, + "balance_loss_mlp": 1.06817102, + "epoch": 0.2184277769427326, + "flos": 19939639481280.0, + "grad_norm": 4.159628979249111, + "language_loss": 0.84536099, + "learning_rate": 3.638967767095249e-06, + "loss": 0.87377822, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.93538498878479 + }, + { + "auxiliary_loss_clip": 0.01511194, + "auxiliary_loss_mlp": 0.01320772, + "balance_loss_clip": 1.16866803, + "balance_loss_mlp": 1.05679512, + "epoch": 0.21848790019540057, + "flos": 20342364691680.0, + "grad_norm": 5.571696043221181, + "language_loss": 0.81836158, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.84668124, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 2.89048171043396 + }, + { + "auxiliary_loss_clip": 0.01499029, + "auxiliary_loss_mlp": 0.01323567, + "balance_loss_clip": 1.15509677, + "balance_loss_mlp": 1.05920839, + "epoch": 0.21854802344806853, + "flos": 15452655413760.0, + "grad_norm": 2.295026181259789, + "language_loss": 0.75171983, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77994573, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 4.322491645812988 + }, + { + "auxiliary_loss_clip": 0.01505774, + "auxiliary_loss_mlp": 0.013079, + "balance_loss_clip": 1.16140902, + "balance_loss_mlp": 1.04563904, + "epoch": 0.2186081467007365, + "flos": 16322281009920.0, + "grad_norm": 2.0465791397675455, + "language_loss": 0.88616222, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.91429895, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.8610122203826904 + }, + { + "auxiliary_loss_clip": 0.01502048, + "auxiliary_loss_mlp": 0.01318543, + "balance_loss_clip": 1.15829611, + "balance_loss_mlp": 1.05819011, + "epoch": 0.2186682699534045, + "flos": 21691406966400.0, + "grad_norm": 2.8657838228944295, + "language_loss": 0.76284063, + "learning_rate": 3.638074464556311e-06, + "loss": 0.79104656, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 2.788654327392578 + }, + { + "auxiliary_loss_clip": 0.01510195, + "auxiliary_loss_mlp": 0.01336912, + "balance_loss_clip": 1.16508532, + "balance_loss_mlp": 1.0738883, + "epoch": 0.21872839320607246, + "flos": 17740656689760.0, + "grad_norm": 2.5051083296230914, + "language_loss": 0.8984803, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92695141, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 4.4062488079071045 + }, + { + "auxiliary_loss_clip": 0.01507394, + "auxiliary_loss_mlp": 0.01325932, + "balance_loss_clip": 1.16156363, + "balance_loss_mlp": 1.06443453, + "epoch": 0.21878851645874042, + "flos": 18653786249760.0, + "grad_norm": 2.564730303033782, + "language_loss": 0.90000302, + "learning_rate": 3.637627440557275e-06, + "loss": 0.92833626, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.8698530197143555 + }, + { + "auxiliary_loss_clip": 0.01511962, + "auxiliary_loss_mlp": 0.01337135, + "balance_loss_clip": 1.16647243, + "balance_loss_mlp": 1.07582819, + "epoch": 0.2188486397114084, + "flos": 25559700694560.0, + "grad_norm": 2.1798846068831903, + "language_loss": 0.79814684, + "learning_rate": 3.637403835405024e-06, + "loss": 0.82663774, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 2.892629623413086 + }, + { + "auxiliary_loss_clip": 0.01506396, + "auxiliary_loss_mlp": 0.01321866, + "balance_loss_clip": 1.16193962, + "balance_loss_mlp": 1.05865169, + "epoch": 0.21890876296407635, + "flos": 17893849985280.0, + "grad_norm": 2.3059164284530786, + "language_loss": 0.72014815, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74843073, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.881849765777588 + }, + { + "auxiliary_loss_clip": 0.01510268, + "auxiliary_loss_mlp": 0.01331991, + "balance_loss_clip": 1.16450083, + "balance_loss_mlp": 1.0703032, + "epoch": 0.21896888621674432, + "flos": 17751049005600.0, + "grad_norm": 2.262896344857151, + "language_loss": 0.80871809, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83714068, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 4.321478843688965 + }, + { + "auxiliary_loss_clip": 0.01509549, + "auxiliary_loss_mlp": 0.01330541, + "balance_loss_clip": 1.16286731, + "balance_loss_mlp": 1.06770861, + "epoch": 0.21902900946941228, + "flos": 23078415693600.0, + "grad_norm": 1.8405932451343752, + "language_loss": 0.72253442, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.75093532, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 2.834167718887329 + }, + { + "auxiliary_loss_clip": 0.01509469, + "auxiliary_loss_mlp": 0.01344063, + "balance_loss_clip": 1.16363621, + "balance_loss_mlp": 1.08695245, + "epoch": 0.21908913272208028, + "flos": 48182215533120.0, + "grad_norm": 2.399640863548416, + "language_loss": 0.68749762, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.71603298, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 3.049814462661743 + }, + { + "auxiliary_loss_clip": 0.01504874, + "auxiliary_loss_mlp": 0.01328793, + "balance_loss_clip": 1.15893555, + "balance_loss_mlp": 1.06634152, + "epoch": 0.21914925597474824, + "flos": 22238943348960.0, + "grad_norm": 2.492502935439559, + "language_loss": 0.78114182, + "learning_rate": 3.636284878455669e-06, + "loss": 0.8094784, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 2.8914644718170166 + }, + { + "auxiliary_loss_clip": 0.01511581, + "auxiliary_loss_mlp": 0.0131832, + "balance_loss_clip": 1.16696393, + "balance_loss_mlp": 1.06178164, + "epoch": 0.2192093792274162, + "flos": 22127698962720.0, + "grad_norm": 1.614883374374868, + "language_loss": 0.82589203, + "learning_rate": 3.636060900887582e-06, + "loss": 0.85419101, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 3.068681240081787 + }, + { + "auxiliary_loss_clip": 0.01505944, + "auxiliary_loss_mlp": 0.01311631, + "balance_loss_clip": 1.16027892, + "balance_loss_mlp": 1.04784513, + "epoch": 0.21926950248008417, + "flos": 15671275513920.0, + "grad_norm": 1.8506750487286059, + "language_loss": 0.82990748, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85808325, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 2.8680930137634277 + }, + { + "auxiliary_loss_clip": 0.01504709, + "auxiliary_loss_mlp": 0.01330026, + "balance_loss_clip": 1.15947342, + "balance_loss_mlp": 1.07024539, + "epoch": 0.21932962573275214, + "flos": 30265077293280.0, + "grad_norm": 1.7910891163011138, + "language_loss": 0.72829926, + "learning_rate": 3.635612759641123e-06, + "loss": 0.75664669, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.9400856494903564 + }, + { + "auxiliary_loss_clip": 0.01500251, + "auxiliary_loss_mlp": 0.01323313, + "balance_loss_clip": 1.1538341, + "balance_loss_mlp": 1.05990863, + "epoch": 0.2193897489854201, + "flos": 10781680020480.0, + "grad_norm": 3.303204651338055, + "language_loss": 0.74537694, + "learning_rate": 3.635388595979745e-06, + "loss": 0.77361262, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 2.866487503051758 + }, + { + "auxiliary_loss_clip": 0.01505655, + "auxiliary_loss_mlp": 0.01316534, + "balance_loss_clip": 1.16010499, + "balance_loss_mlp": 1.05980456, + "epoch": 0.21944987223808807, + "flos": 19135213120800.0, + "grad_norm": 2.0795642793985487, + "language_loss": 0.86673462, + "learning_rate": 3.635164370304267e-06, + "loss": 0.89495659, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 2.8231775760650635 + }, + { + "auxiliary_loss_clip": 0.01501078, + "auxiliary_loss_mlp": 0.0132587, + "balance_loss_clip": 1.15544558, + "balance_loss_mlp": 1.06418228, + "epoch": 0.21950999549075606, + "flos": 22713315582240.0, + "grad_norm": 2.097714940429223, + "language_loss": 0.84231526, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.87058473, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 2.8879828453063965 + }, + { + "auxiliary_loss_clip": 0.01504278, + "auxiliary_loss_mlp": 0.01317145, + "balance_loss_clip": 1.15897369, + "balance_loss_mlp": 1.05450368, + "epoch": 0.21957011874342403, + "flos": 10562870279520.0, + "grad_norm": 4.548137589845908, + "language_loss": 0.74820065, + "learning_rate": 3.634715732945027e-06, + "loss": 0.77641487, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.8785171508789062 + }, + { + "auxiliary_loss_clip": 0.01669705, + "auxiliary_loss_mlp": 0.01318993, + "balance_loss_clip": 1.33392012, + "balance_loss_mlp": 1.11071014, + "epoch": 0.219630241996092, + "flos": 65753672879520.0, + "grad_norm": 0.739432325352685, + "language_loss": 0.5151751, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.54506207, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.3791136741638184 + }, + { + "auxiliary_loss_clip": 0.01511846, + "auxiliary_loss_mlp": 0.01349929, + "balance_loss_clip": 1.16662455, + "balance_loss_mlp": 1.09587038, + "epoch": 0.21969036524875996, + "flos": 23698812800160.0, + "grad_norm": 2.616305591632034, + "language_loss": 0.75789702, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.78651476, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 2.8731091022491455 + }, + { + "auxiliary_loss_clip": 0.01507263, + "auxiliary_loss_mlp": 0.01326449, + "balance_loss_clip": 1.16209078, + "balance_loss_mlp": 1.06056452, + "epoch": 0.21975048850142792, + "flos": 19642659073920.0, + "grad_norm": 3.412026211125506, + "language_loss": 0.7286678, + "learning_rate": 3.634042312013064e-06, + "loss": 0.75700486, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.7959513664245605 + }, + { + "auxiliary_loss_clip": 0.01506553, + "auxiliary_loss_mlp": 0.01326363, + "balance_loss_clip": 1.16049504, + "balance_loss_mlp": 1.07020569, + "epoch": 0.21981061175409589, + "flos": 22450357098720.0, + "grad_norm": 1.9991489303004337, + "language_loss": 0.81009525, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83842444, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.741760015487671 + }, + { + "auxiliary_loss_clip": 0.01514314, + "auxiliary_loss_mlp": 0.01329574, + "balance_loss_clip": 1.16988063, + "balance_loss_mlp": 1.06483424, + "epoch": 0.21987073500676388, + "flos": 18153053580960.0, + "grad_norm": 2.4190958812449317, + "language_loss": 0.84703147, + "learning_rate": 3.63359305489566e-06, + "loss": 0.8754704, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.7084219455718994 + }, + { + "auxiliary_loss_clip": 0.01502954, + "auxiliary_loss_mlp": 0.013156, + "balance_loss_clip": 1.15704882, + "balance_loss_mlp": 1.04971588, + "epoch": 0.21993085825943184, + "flos": 25628351392800.0, + "grad_norm": 1.8256100222011404, + "language_loss": 0.80831951, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.83650506, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.7086949348449707 + }, + { + "auxiliary_loss_clip": 0.01605563, + "auxiliary_loss_mlp": 0.01285126, + "balance_loss_clip": 1.27489221, + "balance_loss_mlp": 1.06387329, + "epoch": 0.2199909815120998, + "flos": 70930993944960.0, + "grad_norm": 0.7809875598133444, + "language_loss": 0.58160925, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.61051619, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 3.363156795501709 + }, + { + "auxiliary_loss_clip": 0.01504685, + "auxiliary_loss_mlp": 0.013224, + "balance_loss_clip": 1.16040838, + "balance_loss_mlp": 1.05746913, + "epoch": 0.22005110476476777, + "flos": 21545685518400.0, + "grad_norm": 2.6361669875477767, + "language_loss": 0.74749482, + "learning_rate": 3.632918704645772e-06, + "loss": 0.77576566, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.9427928924560547 + }, + { + "auxiliary_loss_clip": 0.01504001, + "auxiliary_loss_mlp": 0.01320273, + "balance_loss_clip": 1.15900564, + "balance_loss_mlp": 1.0576309, + "epoch": 0.22011122801743574, + "flos": 22056696718560.0, + "grad_norm": 2.0286947867462417, + "language_loss": 0.81167758, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83992028, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.8299801349639893 + }, + { + "auxiliary_loss_clip": 0.01504817, + "auxiliary_loss_mlp": 0.01297218, + "balance_loss_clip": 1.16003716, + "balance_loss_mlp": 1.03076172, + "epoch": 0.2201713512701037, + "flos": 26690009084640.0, + "grad_norm": 2.6277437403653647, + "language_loss": 0.73349816, + "learning_rate": 3.632468828196102e-06, + "loss": 0.76151854, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 3.013742208480835 + }, + { + "auxiliary_loss_clip": 0.01508315, + "auxiliary_loss_mlp": 0.013177, + "balance_loss_clip": 1.16265059, + "balance_loss_mlp": 1.0563935, + "epoch": 0.22023147452277167, + "flos": 22164300001440.0, + "grad_norm": 1.528896046205897, + "language_loss": 0.78531361, + "learning_rate": 3.632243797111929e-06, + "loss": 0.81357378, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.825165271759033 + }, + { + "auxiliary_loss_clip": 0.01506439, + "auxiliary_loss_mlp": 0.01320244, + "balance_loss_clip": 1.1613276, + "balance_loss_mlp": 1.05359685, + "epoch": 0.22029159777543966, + "flos": 22525190087040.0, + "grad_norm": 2.895341719356976, + "language_loss": 0.80448389, + "learning_rate": 3.632018704132908e-06, + "loss": 0.83275068, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 2.8304669857025146 + }, + { + "auxiliary_loss_clip": 0.01498454, + "auxiliary_loss_mlp": 0.01323513, + "balance_loss_clip": 1.1524241, + "balance_loss_mlp": 1.05400431, + "epoch": 0.22035172102810763, + "flos": 13044155280480.0, + "grad_norm": 2.7412883182102368, + "language_loss": 0.77193069, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.80015039, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.786221504211426 + }, + { + "auxiliary_loss_clip": 0.01498865, + "auxiliary_loss_mlp": 0.01332129, + "balance_loss_clip": 1.15363646, + "balance_loss_mlp": 1.0736835, + "epoch": 0.2204118442807756, + "flos": 12167399190240.0, + "grad_norm": 2.626732651766603, + "language_loss": 0.97671771, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00502777, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.7833926677703857 + }, + { + "auxiliary_loss_clip": 0.01504436, + "auxiliary_loss_mlp": 0.01337355, + "balance_loss_clip": 1.16015625, + "balance_loss_mlp": 1.07833672, + "epoch": 0.22047196753344356, + "flos": 40111667349120.0, + "grad_norm": 1.8249625696972478, + "language_loss": 0.8052007, + "learning_rate": 3.631343053912122e-06, + "loss": 0.83361864, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.945223569869995 + }, + { + "auxiliary_loss_clip": 0.01508471, + "auxiliary_loss_mlp": 0.01329798, + "balance_loss_clip": 1.16290581, + "balance_loss_mlp": 1.06467676, + "epoch": 0.22053209078611152, + "flos": 20703065136480.0, + "grad_norm": 2.177173825304293, + "language_loss": 0.77354074, + "learning_rate": 3.631117713439087e-06, + "loss": 0.80192345, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 5.059779167175293 + }, + { + "auxiliary_loss_clip": 0.01507679, + "auxiliary_loss_mlp": 0.01336276, + "balance_loss_clip": 1.16176534, + "balance_loss_mlp": 1.07477903, + "epoch": 0.2205922140387795, + "flos": 24718559510880.0, + "grad_norm": 4.092715696891099, + "language_loss": 0.71533275, + "learning_rate": 3.630892311113904e-06, + "loss": 0.74377227, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.8794198036193848 + }, + { + "auxiliary_loss_clip": 0.01500009, + "auxiliary_loss_mlp": 0.01323546, + "balance_loss_clip": 1.15424681, + "balance_loss_mlp": 1.06471896, + "epoch": 0.22065233729144745, + "flos": 23479509993120.0, + "grad_norm": 2.4764728142167063, + "language_loss": 0.85714304, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.8853786, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 2.9392571449279785 + }, + { + "auxiliary_loss_clip": 0.01504507, + "auxiliary_loss_mlp": 0.01325531, + "balance_loss_clip": 1.15942574, + "balance_loss_mlp": 1.05583239, + "epoch": 0.22071246054411545, + "flos": 35228481714720.0, + "grad_norm": 1.9278755089963209, + "language_loss": 0.77143741, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79973787, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.9848263263702393 + }, + { + "auxiliary_loss_clip": 0.01502602, + "auxiliary_loss_mlp": 0.01317247, + "balance_loss_clip": 1.15789425, + "balance_loss_mlp": 1.05231595, + "epoch": 0.2207725837967834, + "flos": 18152257089600.0, + "grad_norm": 2.370778849631727, + "language_loss": 0.81506836, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.84326684, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 2.849008083343506 + }, + { + "auxiliary_loss_clip": 0.01507901, + "auxiliary_loss_mlp": 0.01318237, + "balance_loss_clip": 1.16346765, + "balance_loss_mlp": 1.05025482, + "epoch": 0.22083270704945138, + "flos": 20481448711680.0, + "grad_norm": 2.3446088614474268, + "language_loss": 0.73815453, + "learning_rate": 3.629990083462682e-06, + "loss": 0.76641595, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 2.8192138671875 + }, + { + "auxiliary_loss_clip": 0.01507924, + "auxiliary_loss_mlp": 0.0130863, + "balance_loss_clip": 1.16275549, + "balance_loss_mlp": 1.03988409, + "epoch": 0.22089283030211934, + "flos": 34128364504320.0, + "grad_norm": 2.3695565721504144, + "language_loss": 0.7667172, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.79488277, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 4.381926774978638 + }, + { + "auxiliary_loss_clip": 0.01505486, + "auxiliary_loss_mlp": 0.0131943, + "balance_loss_clip": 1.1603322, + "balance_loss_mlp": 1.04858673, + "epoch": 0.2209529535547873, + "flos": 18079168796640.0, + "grad_norm": 2.01191810314241, + "language_loss": 0.74686605, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.77511519, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 2.81856632232666 + }, + { + "auxiliary_loss_clip": 0.0150817, + "auxiliary_loss_mlp": 0.01311286, + "balance_loss_clip": 1.16311109, + "balance_loss_mlp": 1.04120564, + "epoch": 0.22101307680745527, + "flos": 27237810964320.0, + "grad_norm": 1.839384640007647, + "language_loss": 0.80201346, + "learning_rate": 3.629312763695772e-06, + "loss": 0.830208, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 4.4698920249938965 + }, + { + "auxiliary_loss_clip": 0.01506574, + "auxiliary_loss_mlp": 0.01313602, + "balance_loss_clip": 1.16199923, + "balance_loss_mlp": 1.04542923, + "epoch": 0.22107320006012326, + "flos": 16545035279520.0, + "grad_norm": 2.3287255869439836, + "language_loss": 0.75541025, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.78361201, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.8779642581939697 + }, + { + "auxiliary_loss_clip": 0.01503052, + "auxiliary_loss_mlp": 0.01308028, + "balance_loss_clip": 1.15857792, + "balance_loss_mlp": 1.04366994, + "epoch": 0.22113332331279123, + "flos": 22056658790400.0, + "grad_norm": 1.8728404755502681, + "language_loss": 0.83420658, + "learning_rate": 3.628860908251712e-06, + "loss": 0.86231732, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 2.860452651977539 + }, + { + "auxiliary_loss_clip": 0.01508041, + "auxiliary_loss_mlp": 0.01305268, + "balance_loss_clip": 1.16262174, + "balance_loss_mlp": 1.04148221, + "epoch": 0.2211934465654592, + "flos": 26615100240000.0, + "grad_norm": 2.134580860863769, + "language_loss": 0.89014852, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91828167, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 4.549888610839844 + }, + { + "auxiliary_loss_clip": 0.01508046, + "auxiliary_loss_mlp": 0.0131773, + "balance_loss_clip": 1.16403663, + "balance_loss_mlp": 1.04822159, + "epoch": 0.22125356981812716, + "flos": 16362105942240.0, + "grad_norm": 2.289116419207771, + "language_loss": 0.8684482, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.89670599, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 2.8132879734039307 + }, + { + "auxiliary_loss_clip": 0.0151384, + "auxiliary_loss_mlp": 0.01307124, + "balance_loss_clip": 1.16937327, + "balance_loss_mlp": 1.03914154, + "epoch": 0.22131369307079513, + "flos": 21653175016800.0, + "grad_norm": 2.1791260941861377, + "language_loss": 0.81178987, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.8399995, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.883009910583496 + }, + { + "auxiliary_loss_clip": 0.0150848, + "auxiliary_loss_mlp": 0.01314269, + "balance_loss_clip": 1.16359651, + "balance_loss_mlp": 1.05010152, + "epoch": 0.2213738163234631, + "flos": 19611557618400.0, + "grad_norm": 2.2627083342309318, + "language_loss": 0.79502004, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82324755, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 2.850996732711792 + }, + { + "auxiliary_loss_clip": 0.01506155, + "auxiliary_loss_mlp": 0.01318224, + "balance_loss_clip": 1.16234994, + "balance_loss_mlp": 1.05119514, + "epoch": 0.22143393957613106, + "flos": 23625686579040.0, + "grad_norm": 1.6846118111632282, + "language_loss": 0.77474159, + "learning_rate": 3.627730188876638e-06, + "loss": 0.80298537, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.8351759910583496 + }, + { + "auxiliary_loss_clip": 0.01504401, + "auxiliary_loss_mlp": 0.01305777, + "balance_loss_clip": 1.15980673, + "balance_loss_mlp": 1.03798532, + "epoch": 0.22149406282879905, + "flos": 26180249513760.0, + "grad_norm": 2.0057938405243414, + "language_loss": 0.7261911, + "learning_rate": 3.627503859796234e-06, + "loss": 0.75429285, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 2.8443918228149414 + }, + { + "auxiliary_loss_clip": 0.01508381, + "auxiliary_loss_mlp": 0.01321888, + "balance_loss_clip": 1.16348529, + "balance_loss_mlp": 1.05848312, + "epoch": 0.221554186081467, + "flos": 14540549914080.0, + "grad_norm": 5.080894157553341, + "language_loss": 0.80095172, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82925439, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 2.77939772605896 + }, + { + "auxiliary_loss_clip": 0.01507586, + "auxiliary_loss_mlp": 0.01314943, + "balance_loss_clip": 1.16283822, + "balance_loss_mlp": 1.05268216, + "epoch": 0.22161430933413498, + "flos": 22240536331680.0, + "grad_norm": 1.5293791017486587, + "language_loss": 0.87600559, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.90423089, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 2.9377739429473877 + }, + { + "auxiliary_loss_clip": 0.01507763, + "auxiliary_loss_mlp": 0.01318826, + "balance_loss_clip": 1.16347623, + "balance_loss_mlp": 1.05198824, + "epoch": 0.22167443258680294, + "flos": 23478751429920.0, + "grad_norm": 2.535264483870059, + "language_loss": 0.77804196, + "learning_rate": 3.626824502298707e-06, + "loss": 0.80630785, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 2.7942910194396973 + }, + { + "auxiliary_loss_clip": 0.01503619, + "auxiliary_loss_mlp": 0.01332062, + "balance_loss_clip": 1.1600014, + "balance_loss_mlp": 1.06312561, + "epoch": 0.2217345558394709, + "flos": 23223340650240.0, + "grad_norm": 2.0137465783644304, + "language_loss": 0.85075057, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87910748, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 2.7709860801696777 + }, + { + "auxiliary_loss_clip": 0.01505372, + "auxiliary_loss_mlp": 0.01331333, + "balance_loss_clip": 1.16087461, + "balance_loss_mlp": 1.06525755, + "epoch": 0.22179467909213887, + "flos": 20013144984000.0, + "grad_norm": 2.6466831539222513, + "language_loss": 0.81421876, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.8425858, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.8439688682556152 + }, + { + "auxiliary_loss_clip": 0.0150448, + "auxiliary_loss_mlp": 0.01311983, + "balance_loss_clip": 1.16178274, + "balance_loss_mlp": 1.04476357, + "epoch": 0.22185480234480687, + "flos": 19685101049280.0, + "grad_norm": 2.520152692195032, + "language_loss": 0.70289052, + "learning_rate": 3.626144589597061e-06, + "loss": 0.73105508, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 2.804797410964966 + }, + { + "auxiliary_loss_clip": 0.01502733, + "auxiliary_loss_mlp": 0.01315897, + "balance_loss_clip": 1.16001952, + "balance_loss_mlp": 1.04867744, + "epoch": 0.22191492559747483, + "flos": 21983494641120.0, + "grad_norm": 2.3110859778276485, + "language_loss": 0.72169739, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74988365, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.7772960662841797 + }, + { + "auxiliary_loss_clip": 0.0151156, + "auxiliary_loss_mlp": 0.01318608, + "balance_loss_clip": 1.16782618, + "balance_loss_mlp": 1.05062532, + "epoch": 0.2219750488501428, + "flos": 23224326782400.0, + "grad_norm": 1.8302659143488773, + "language_loss": 0.71739542, + "learning_rate": 3.625691006130477e-06, + "loss": 0.74569714, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.8187503814697266 + }, + { + "auxiliary_loss_clip": 0.01508069, + "auxiliary_loss_mlp": 0.01318193, + "balance_loss_clip": 1.16443908, + "balance_loss_mlp": 1.05288088, + "epoch": 0.22203517210281076, + "flos": 22455780825600.0, + "grad_norm": 1.6788988842077446, + "language_loss": 0.87717152, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.90543419, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 2.7967677116394043 + }, + { + "auxiliary_loss_clip": 0.01502074, + "auxiliary_loss_mlp": 0.01305376, + "balance_loss_clip": 1.15870535, + "balance_loss_mlp": 1.04349756, + "epoch": 0.22209529535547873, + "flos": 17566374972960.0, + "grad_norm": 2.205294980415996, + "language_loss": 0.85711271, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.88518721, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.7246081829071045 + }, + { + "auxiliary_loss_clip": 0.0149629, + "auxiliary_loss_mlp": 0.01306245, + "balance_loss_clip": 1.15179467, + "balance_loss_mlp": 1.03673673, + "epoch": 0.2221554186081467, + "flos": 21471080099040.0, + "grad_norm": 1.896723923650032, + "language_loss": 0.69718331, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.7252087, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.7978289127349854 + }, + { + "auxiliary_loss_clip": 0.0150695, + "auxiliary_loss_mlp": 0.01323516, + "balance_loss_clip": 1.16304612, + "balance_loss_mlp": 1.06297266, + "epoch": 0.22221554186081466, + "flos": 27675961440480.0, + "grad_norm": 2.0352320152027112, + "language_loss": 0.71429336, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.742598, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.855435848236084 + }, + { + "auxiliary_loss_clip": 0.01500894, + "auxiliary_loss_mlp": 0.0131422, + "balance_loss_clip": 1.1579417, + "balance_loss_mlp": 1.04681015, + "epoch": 0.22227566511348265, + "flos": 25961363916480.0, + "grad_norm": 1.6453541163314929, + "language_loss": 0.87637389, + "learning_rate": 3.624555968803217e-06, + "loss": 0.90452504, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.855037212371826 + }, + { + "auxiliary_loss_clip": 0.01505388, + "auxiliary_loss_mlp": 0.01299117, + "balance_loss_clip": 1.16220975, + "balance_loss_mlp": 1.0364747, + "epoch": 0.22233578836615062, + "flos": 39206881984320.0, + "grad_norm": 1.7607663702325016, + "language_loss": 0.66290039, + "learning_rate": 3.624328776493346e-06, + "loss": 0.69094545, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.9579436779022217 + }, + { + "auxiliary_loss_clip": 0.01503529, + "auxiliary_loss_mlp": 0.01304047, + "balance_loss_clip": 1.16004348, + "balance_loss_mlp": 1.03091431, + "epoch": 0.22239591161881858, + "flos": 36286763800320.0, + "grad_norm": 1.9988378985962985, + "language_loss": 0.82478881, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.8528645, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 2.8976993560791016 + }, + { + "auxiliary_loss_clip": 0.0150328, + "auxiliary_loss_mlp": 0.01305278, + "balance_loss_clip": 1.16040444, + "balance_loss_mlp": 1.03481638, + "epoch": 0.22245603487148655, + "flos": 19721664159840.0, + "grad_norm": 2.5246040274508124, + "language_loss": 0.79664427, + "learning_rate": 3.62387420709809e-06, + "loss": 0.8247298, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.7752506732940674 + }, + { + "auxiliary_loss_clip": 0.01505646, + "auxiliary_loss_mlp": 0.01324313, + "balance_loss_clip": 1.16127574, + "balance_loss_mlp": 1.05518579, + "epoch": 0.2225161581241545, + "flos": 46283968036800.0, + "grad_norm": 2.0415773270991244, + "language_loss": 0.72390091, + "learning_rate": 3.623646830029943e-06, + "loss": 0.75220048, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 2.9272336959838867 + }, + { + "auxiliary_loss_clip": 0.01497713, + "auxiliary_loss_mlp": 0.01295081, + "balance_loss_clip": 1.15443659, + "balance_loss_mlp": 1.0276711, + "epoch": 0.22257628137682248, + "flos": 23698471446720.0, + "grad_norm": 1.9034091236728816, + "language_loss": 0.80192405, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82985204, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 2.906834125518799 + }, + { + "auxiliary_loss_clip": 0.01503999, + "auxiliary_loss_mlp": 0.01297239, + "balance_loss_clip": 1.16093886, + "balance_loss_mlp": 1.03497887, + "epoch": 0.22263640462949044, + "flos": 19355995126080.0, + "grad_norm": 3.26402985824073, + "language_loss": 0.77746522, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80547762, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.7325334548950195 + }, + { + "auxiliary_loss_clip": 0.01495846, + "auxiliary_loss_mlp": 0.01305288, + "balance_loss_clip": 1.15151322, + "balance_loss_mlp": 1.03883171, + "epoch": 0.22269652788215843, + "flos": 20778087765600.0, + "grad_norm": 2.649310834506223, + "language_loss": 0.75176787, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.77977926, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.76169753074646 + }, + { + "auxiliary_loss_clip": 0.01504432, + "auxiliary_loss_mlp": 0.01307966, + "balance_loss_clip": 1.15974092, + "balance_loss_mlp": 1.0468502, + "epoch": 0.2227566511348264, + "flos": 47962457588160.0, + "grad_norm": 1.8804877383372187, + "language_loss": 0.64889503, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.677019, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 3.0226693153381348 + }, + { + "auxiliary_loss_clip": 0.0162452, + "auxiliary_loss_mlp": 0.01326378, + "balance_loss_clip": 1.29550195, + "balance_loss_mlp": 1.10207367, + "epoch": 0.22281677438749437, + "flos": 66224555722080.0, + "grad_norm": 1.3615780966518751, + "language_loss": 0.65148091, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.68098986, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 4.863999366760254 + }, + { + "auxiliary_loss_clip": 0.01497667, + "auxiliary_loss_mlp": 0.01313767, + "balance_loss_clip": 1.15390158, + "balance_loss_mlp": 1.04711986, + "epoch": 0.22287689764016233, + "flos": 21873767381280.0, + "grad_norm": 4.04855272594677, + "language_loss": 0.81021833, + "learning_rate": 3.622281274977141e-06, + "loss": 0.83833265, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 2.7863235473632812 + }, + { + "auxiliary_loss_clip": 0.01503822, + "auxiliary_loss_mlp": 0.01304525, + "balance_loss_clip": 1.15769815, + "balance_loss_mlp": 1.04531634, + "epoch": 0.2229370208928303, + "flos": 27674937380160.0, + "grad_norm": 1.9337414623398586, + "language_loss": 0.78511959, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.8132031, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.8596742153167725 + }, + { + "auxiliary_loss_clip": 0.01496985, + "auxiliary_loss_mlp": 0.0130905, + "balance_loss_clip": 1.15231836, + "balance_loss_mlp": 1.03935051, + "epoch": 0.22299714414549826, + "flos": 30157322297760.0, + "grad_norm": 20.17010350279969, + "language_loss": 0.8027252, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.83078551, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 2.8652377128601074 + }, + { + "auxiliary_loss_clip": 0.01497092, + "auxiliary_loss_mlp": 0.01296414, + "balance_loss_clip": 1.15224099, + "balance_loss_mlp": 1.02404475, + "epoch": 0.22305726739816625, + "flos": 23145056199360.0, + "grad_norm": 2.6285165969089355, + "language_loss": 0.69103557, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.71897066, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.8293581008911133 + }, + { + "auxiliary_loss_clip": 0.01492536, + "auxiliary_loss_mlp": 0.01297827, + "balance_loss_clip": 1.14595675, + "balance_loss_mlp": 1.03156137, + "epoch": 0.22311739065083422, + "flos": 19174089849120.0, + "grad_norm": 2.1942060410048216, + "language_loss": 0.90991986, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.93782353, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.74357533454895 + }, + { + "auxiliary_loss_clip": 0.01494406, + "auxiliary_loss_mlp": 0.01305312, + "balance_loss_clip": 1.14904571, + "balance_loss_mlp": 1.03561246, + "epoch": 0.22317751390350218, + "flos": 13619000302560.0, + "grad_norm": 2.5802277093193617, + "language_loss": 0.90125942, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.92925662, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 4.327265977859497 + }, + { + "auxiliary_loss_clip": 0.01496687, + "auxiliary_loss_mlp": 0.01304947, + "balance_loss_clip": 1.15097296, + "balance_loss_mlp": 1.03486657, + "epoch": 0.22323763715617015, + "flos": 11030263731360.0, + "grad_norm": 3.2231236931548715, + "language_loss": 0.75228393, + "learning_rate": 3.620913505310117e-06, + "loss": 0.78030026, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 2.7798995971679688 + }, + { + "auxiliary_loss_clip": 0.01498456, + "auxiliary_loss_mlp": 0.0130873, + "balance_loss_clip": 1.15326309, + "balance_loss_mlp": 1.03979421, + "epoch": 0.22329776040883811, + "flos": 41354585539200.0, + "grad_norm": 1.8706246879877968, + "language_loss": 0.63044602, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.65851784, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 2.9098169803619385 + }, + { + "auxiliary_loss_clip": 0.01495815, + "auxiliary_loss_mlp": 0.01303318, + "balance_loss_clip": 1.1503942, + "balance_loss_mlp": 1.03533554, + "epoch": 0.22335788366150608, + "flos": 25121853643680.0, + "grad_norm": 3.206111246333968, + "language_loss": 0.79625273, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.82424402, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 4.351353883743286 + }, + { + "auxiliary_loss_clip": 0.01497931, + "auxiliary_loss_mlp": 0.01309385, + "balance_loss_clip": 1.15309, + "balance_loss_mlp": 1.03987694, + "epoch": 0.22341800691417404, + "flos": 16985309732640.0, + "grad_norm": 2.3226071088930054, + "language_loss": 0.77544308, + "learning_rate": 3.620228790579645e-06, + "loss": 0.80351627, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.747309684753418 + }, + { + "auxiliary_loss_clip": 0.01499122, + "auxiliary_loss_mlp": 0.01306847, + "balance_loss_clip": 1.15319443, + "balance_loss_mlp": 1.03943706, + "epoch": 0.22347813016684204, + "flos": 14138469482400.0, + "grad_norm": 4.401072303496646, + "language_loss": 0.7947197, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.82277942, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 4.199835300445557 + }, + { + "auxiliary_loss_clip": 0.01489094, + "auxiliary_loss_mlp": 0.01299424, + "balance_loss_clip": 1.14357758, + "balance_loss_mlp": 1.03220439, + "epoch": 0.22353825341951, + "flos": 23585216868000.0, + "grad_norm": 2.6439002880574702, + "language_loss": 0.68718028, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.71506548, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.761647939682007 + }, + { + "auxiliary_loss_clip": 0.01495124, + "auxiliary_loss_mlp": 0.01306517, + "balance_loss_clip": 1.14949882, + "balance_loss_mlp": 1.03815317, + "epoch": 0.22359837667217797, + "flos": 29826737176320.0, + "grad_norm": 1.503414846262132, + "language_loss": 0.81110471, + "learning_rate": 3.619543522896045e-06, + "loss": 0.8391211, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 2.8375396728515625 + }, + { + "auxiliary_loss_clip": 0.01489574, + "auxiliary_loss_mlp": 0.01310092, + "balance_loss_clip": 1.14424849, + "balance_loss_mlp": 1.04153705, + "epoch": 0.22365849992484593, + "flos": 17605024132320.0, + "grad_norm": 1.8650930149045641, + "language_loss": 0.86945581, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.89745247, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 2.733322858810425 + }, + { + "auxiliary_loss_clip": 0.01500418, + "auxiliary_loss_mlp": 0.01323397, + "balance_loss_clip": 1.15522814, + "balance_loss_mlp": 1.06094635, + "epoch": 0.2237186231775139, + "flos": 22713163869600.0, + "grad_norm": 1.7167442408203955, + "language_loss": 0.74720174, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77543986, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 2.7735512256622314 + }, + { + "auxiliary_loss_clip": 0.01493918, + "auxiliary_loss_mlp": 0.01315246, + "balance_loss_clip": 1.14832592, + "balance_loss_mlp": 1.04917061, + "epoch": 0.22377874643018186, + "flos": 13373261203680.0, + "grad_norm": 3.241276532454261, + "language_loss": 0.79179537, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81988704, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 2.734623908996582 + }, + { + "auxiliary_loss_clip": 0.01498297, + "auxiliary_loss_mlp": 0.01310553, + "balance_loss_clip": 1.15274048, + "balance_loss_mlp": 1.04771996, + "epoch": 0.22383886968284986, + "flos": 17896922166240.0, + "grad_norm": 2.091805689977376, + "language_loss": 0.82524061, + "learning_rate": 3.618628972906178e-06, + "loss": 0.85332906, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.787264823913574 + }, + { + "auxiliary_loss_clip": 0.01498107, + "auxiliary_loss_mlp": 0.01316533, + "balance_loss_clip": 1.1534512, + "balance_loss_mlp": 1.05465436, + "epoch": 0.22389899293551782, + "flos": 23881438712160.0, + "grad_norm": 2.182783416604343, + "language_loss": 0.84927005, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.87741643, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 2.76566743850708 + }, + { + "auxiliary_loss_clip": 0.01497141, + "auxiliary_loss_mlp": 0.01319631, + "balance_loss_clip": 1.15207517, + "balance_loss_mlp": 1.05851519, + "epoch": 0.2239591161881858, + "flos": 27274942997280.0, + "grad_norm": 1.8208694202410203, + "language_loss": 0.79575896, + "learning_rate": 3.618171329605121e-06, + "loss": 0.82392663, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 2.8822848796844482 + }, + { + "auxiliary_loss_clip": 0.01495617, + "auxiliary_loss_mlp": 0.01319358, + "balance_loss_clip": 1.14950597, + "balance_loss_mlp": 1.06053042, + "epoch": 0.22401923944085375, + "flos": 22239057133440.0, + "grad_norm": 8.784978442983741, + "language_loss": 0.77837098, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.8065207, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 2.8113536834716797 + }, + { + "auxiliary_loss_clip": 0.01497332, + "auxiliary_loss_mlp": 0.01333228, + "balance_loss_clip": 1.15232944, + "balance_loss_mlp": 1.06734371, + "epoch": 0.22407936269352172, + "flos": 12055092815520.0, + "grad_norm": 3.3946590424528393, + "language_loss": 0.7250213, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.75332689, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 2.683194875717163 + }, + { + "auxiliary_loss_clip": 0.01491189, + "auxiliary_loss_mlp": 0.01307674, + "balance_loss_clip": 1.14563417, + "balance_loss_mlp": 1.04445958, + "epoch": 0.22413948594618968, + "flos": 19355539988160.0, + "grad_norm": 2.377266021011125, + "language_loss": 0.86842388, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.89641249, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.752732276916504 + }, + { + "auxiliary_loss_clip": 0.01497449, + "auxiliary_loss_mlp": 0.01340125, + "balance_loss_clip": 1.15157771, + "balance_loss_mlp": 1.08339584, + "epoch": 0.22419960919885765, + "flos": 24172274757600.0, + "grad_norm": 2.230667187862715, + "language_loss": 0.80302304, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.83139884, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.770751714706421 + }, + { + "auxiliary_loss_clip": 0.01491349, + "auxiliary_loss_mlp": 0.01308528, + "balance_loss_clip": 1.14594197, + "balance_loss_mlp": 1.0481751, + "epoch": 0.22425973245152564, + "flos": 27381332579040.0, + "grad_norm": 1.8152638219522308, + "language_loss": 0.86657214, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.89457095, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 2.8095686435699463 + }, + { + "auxiliary_loss_clip": 0.01486908, + "auxiliary_loss_mlp": 0.01303078, + "balance_loss_clip": 1.14240944, + "balance_loss_mlp": 1.03891027, + "epoch": 0.2243198557041936, + "flos": 13737375182880.0, + "grad_norm": 2.5078224101709985, + "language_loss": 0.73399144, + "learning_rate": 3.616796927310559e-06, + "loss": 0.76189125, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.7264444828033447 + }, + { + "auxiliary_loss_clip": 0.01495772, + "auxiliary_loss_mlp": 0.01307374, + "balance_loss_clip": 1.15033698, + "balance_loss_mlp": 1.04282451, + "epoch": 0.22437997895686157, + "flos": 19532400819840.0, + "grad_norm": 2.2870409883659146, + "language_loss": 0.75581479, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.78384626, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 2.8243212699890137 + }, + { + "auxiliary_loss_clip": 0.01495906, + "auxiliary_loss_mlp": 0.01307105, + "balance_loss_clip": 1.15060925, + "balance_loss_mlp": 1.04198337, + "epoch": 0.22444010220952954, + "flos": 23698509374880.0, + "grad_norm": 2.0356561713606705, + "language_loss": 0.88312435, + "learning_rate": 3.616338302646873e-06, + "loss": 0.91115445, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.8640613555908203 + }, + { + "auxiliary_loss_clip": 0.01489378, + "auxiliary_loss_mlp": 0.01313543, + "balance_loss_clip": 1.14434409, + "balance_loss_mlp": 1.05204511, + "epoch": 0.2245002254621975, + "flos": 22385119934880.0, + "grad_norm": 1.7214749301262973, + "language_loss": 0.84537899, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.87340814, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.822481393814087 + }, + { + "auxiliary_loss_clip": 0.01488876, + "auxiliary_loss_mlp": 0.01312654, + "balance_loss_clip": 1.1430105, + "balance_loss_mlp": 1.05039334, + "epoch": 0.22456034871486547, + "flos": 26944699229280.0, + "grad_norm": 1.9725663478499496, + "language_loss": 0.76699191, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79500717, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.8018798828125 + }, + { + "auxiliary_loss_clip": 0.01497466, + "auxiliary_loss_mlp": 0.01304445, + "balance_loss_clip": 1.15255725, + "balance_loss_mlp": 1.04161227, + "epoch": 0.22462047196753343, + "flos": 28985937346080.0, + "grad_norm": 2.0731398258328664, + "language_loss": 0.85043639, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.87845552, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 2.8160529136657715 + }, + { + "auxiliary_loss_clip": 0.01494079, + "auxiliary_loss_mlp": 0.01302725, + "balance_loss_clip": 1.14907992, + "balance_loss_mlp": 1.03779411, + "epoch": 0.22468059522020142, + "flos": 20013410481120.0, + "grad_norm": 2.283331272986604, + "language_loss": 0.86786687, + "learning_rate": 3.615420317888586e-06, + "loss": 0.89583486, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.7971041202545166 + }, + { + "auxiliary_loss_clip": 0.01490274, + "auxiliary_loss_mlp": 0.01305874, + "balance_loss_clip": 1.14434993, + "balance_loss_mlp": 1.04151499, + "epoch": 0.2247407184728694, + "flos": 29316787964640.0, + "grad_norm": 5.895440327995752, + "language_loss": 0.79079688, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81875837, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.805838108062744 + }, + { + "auxiliary_loss_clip": 0.0149665, + "auxiliary_loss_mlp": 0.01305286, + "balance_loss_clip": 1.15109682, + "balance_loss_mlp": 1.03978276, + "epoch": 0.22480084172553735, + "flos": 22312600564320.0, + "grad_norm": 1.8331789734119366, + "language_loss": 0.76687801, + "learning_rate": 3.614960957933224e-06, + "loss": 0.79489732, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 2.8356363773345947 + }, + { + "auxiliary_loss_clip": 0.01492481, + "auxiliary_loss_mlp": 0.0131348, + "balance_loss_clip": 1.1466186, + "balance_loss_mlp": 1.05102849, + "epoch": 0.22486096497820532, + "flos": 25593532977600.0, + "grad_norm": 3.193047055479853, + "language_loss": 0.74106705, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76912671, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.8075032234191895 + }, + { + "auxiliary_loss_clip": 0.01490818, + "auxiliary_loss_mlp": 0.01296108, + "balance_loss_clip": 1.1448586, + "balance_loss_mlp": 1.03041458, + "epoch": 0.22492108823087328, + "flos": 17641776883680.0, + "grad_norm": 2.1664516125075948, + "language_loss": 0.75772381, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78559303, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 2.758463144302368 + }, + { + "auxiliary_loss_clip": 0.01494902, + "auxiliary_loss_mlp": 0.01315654, + "balance_loss_clip": 1.14817619, + "balance_loss_mlp": 1.04919779, + "epoch": 0.22498121148354125, + "flos": 16036527337920.0, + "grad_norm": 5.2812481818902555, + "language_loss": 0.8763113, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.90441686, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.772526264190674 + }, + { + "auxiliary_loss_clip": 0.01494285, + "auxiliary_loss_mlp": 0.01310366, + "balance_loss_clip": 1.14815271, + "balance_loss_mlp": 1.04600787, + "epoch": 0.22504133473620924, + "flos": 24026022315360.0, + "grad_norm": 1.8452992771748669, + "language_loss": 0.8175391, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84558558, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.7887396812438965 + }, + { + "auxiliary_loss_clip": 0.01493386, + "auxiliary_loss_mlp": 0.01297605, + "balance_loss_clip": 1.1472646, + "balance_loss_mlp": 1.03343737, + "epoch": 0.2251014579888772, + "flos": 16765855212960.0, + "grad_norm": 2.781354448623631, + "language_loss": 0.63752174, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66543168, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 4.438545227050781 + }, + { + "auxiliary_loss_clip": 0.01486844, + "auxiliary_loss_mlp": 0.01323638, + "balance_loss_clip": 1.1408596, + "balance_loss_mlp": 1.06347549, + "epoch": 0.22516158124154517, + "flos": 13992748034400.0, + "grad_norm": 4.091341230611758, + "language_loss": 0.76598191, + "learning_rate": 3.613581408598489e-06, + "loss": 0.79408675, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 2.6713881492614746 + }, + { + "auxiliary_loss_clip": 0.01501207, + "auxiliary_loss_mlp": 0.0131406, + "balance_loss_clip": 1.15405154, + "balance_loss_mlp": 1.05122709, + "epoch": 0.22522170449421314, + "flos": 14391642500640.0, + "grad_norm": 1.9898416933228196, + "language_loss": 0.80957663, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83772933, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 2.7339751720428467 + }, + { + "auxiliary_loss_clip": 0.01489108, + "auxiliary_loss_mlp": 0.01311999, + "balance_loss_clip": 1.1417706, + "balance_loss_mlp": 1.04649663, + "epoch": 0.2252818277468811, + "flos": 23807819424960.0, + "grad_norm": 2.493231087859346, + "language_loss": 0.86106741, + "learning_rate": 3.613121069229862e-06, + "loss": 0.8890785, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.7931759357452393 + }, + { + "auxiliary_loss_clip": 0.01489463, + "auxiliary_loss_mlp": 0.01319037, + "balance_loss_clip": 1.14274263, + "balance_loss_mlp": 1.05773067, + "epoch": 0.22534195099954907, + "flos": 24720380062560.0, + "grad_norm": 1.7335716650180526, + "language_loss": 0.76749623, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.79558134, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 2.805011034011841 + }, + { + "auxiliary_loss_clip": 0.01496104, + "auxiliary_loss_mlp": 0.01313774, + "balance_loss_clip": 1.14944673, + "balance_loss_mlp": 1.05017877, + "epoch": 0.22540207425221703, + "flos": 21034446749280.0, + "grad_norm": 1.7236266185539844, + "language_loss": 0.80078435, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.82888305, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.7844300270080566 + }, + { + "auxiliary_loss_clip": 0.01488699, + "auxiliary_loss_mlp": 0.01296328, + "balance_loss_clip": 1.14071918, + "balance_loss_mlp": 1.03425789, + "epoch": 0.22546219750488503, + "flos": 19392368595840.0, + "grad_norm": 1.6705728900513468, + "language_loss": 0.798958, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82680827, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 2.76224684715271 + }, + { + "auxiliary_loss_clip": 0.01496707, + "auxiliary_loss_mlp": 0.01309569, + "balance_loss_clip": 1.14850998, + "balance_loss_mlp": 1.04463804, + "epoch": 0.225522320757553, + "flos": 25195017792960.0, + "grad_norm": 1.9692978079716785, + "language_loss": 0.82028157, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84834433, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 4.317918539047241 + }, + { + "auxiliary_loss_clip": 0.01495985, + "auxiliary_loss_mlp": 0.0132029, + "balance_loss_clip": 1.14791703, + "balance_loss_mlp": 1.05688512, + "epoch": 0.22558244401022096, + "flos": 17164901391840.0, + "grad_norm": 11.696043099298151, + "language_loss": 0.83893502, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86709774, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 4.315500974655151 + }, + { + "auxiliary_loss_clip": 0.0148372, + "auxiliary_loss_mlp": 0.01302986, + "balance_loss_clip": 1.13490498, + "balance_loss_mlp": 1.04473078, + "epoch": 0.22564256726288892, + "flos": 15232783684320.0, + "grad_norm": 1.7790187121477126, + "language_loss": 0.78600943, + "learning_rate": 3.611738583330375e-06, + "loss": 0.81387651, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.790748119354248 + }, + { + "auxiliary_loss_clip": 0.01491234, + "auxiliary_loss_mlp": 0.01309106, + "balance_loss_clip": 1.14353704, + "balance_loss_mlp": 1.05046916, + "epoch": 0.2257026905155569, + "flos": 34571521497600.0, + "grad_norm": 2.007337860766809, + "language_loss": 0.78522027, + "learning_rate": 3.611507955052295e-06, + "loss": 0.81322366, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 2.9047226905822754 + }, + { + "auxiliary_loss_clip": 0.01496239, + "auxiliary_loss_mlp": 0.01315142, + "balance_loss_clip": 1.14845908, + "balance_loss_mlp": 1.05326343, + "epoch": 0.22576281376822485, + "flos": 19940511828960.0, + "grad_norm": 2.1087365346563125, + "language_loss": 0.70687497, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.73498881, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 2.8091397285461426 + }, + { + "auxiliary_loss_clip": 0.01493147, + "auxiliary_loss_mlp": 0.01310943, + "balance_loss_clip": 1.14503336, + "balance_loss_mlp": 1.03933644, + "epoch": 0.22582293702089282, + "flos": 24603977446560.0, + "grad_norm": 2.5345222922299535, + "language_loss": 0.78062385, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.80866474, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 4.259307861328125 + }, + { + "auxiliary_loss_clip": 0.01497635, + "auxiliary_loss_mlp": 0.01304801, + "balance_loss_clip": 1.14888644, + "balance_loss_mlp": 1.04025197, + "epoch": 0.2258830602735608, + "flos": 23037452916480.0, + "grad_norm": 2.4154679417777847, + "language_loss": 0.82530802, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.85333234, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 2.764754295349121 + }, + { + "auxiliary_loss_clip": 0.01500121, + "auxiliary_loss_mlp": 0.01319706, + "balance_loss_clip": 1.15115786, + "balance_loss_mlp": 1.05611002, + "epoch": 0.22594318352622877, + "flos": 22160317544640.0, + "grad_norm": 1.8224508192829916, + "language_loss": 0.73080385, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75900221, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 2.8016061782836914 + }, + { + "auxiliary_loss_clip": 0.01493601, + "auxiliary_loss_mlp": 0.01304835, + "balance_loss_clip": 1.14484441, + "balance_loss_mlp": 1.03608966, + "epoch": 0.22600330677889674, + "flos": 20596372129440.0, + "grad_norm": 2.30117308541469, + "language_loss": 0.76503575, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79302001, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 2.7688186168670654 + }, + { + "auxiliary_loss_clip": 0.01486671, + "auxiliary_loss_mlp": 0.01307824, + "balance_loss_clip": 1.13822317, + "balance_loss_mlp": 1.0375526, + "epoch": 0.2260634300315647, + "flos": 35662535949600.0, + "grad_norm": 3.401502373025869, + "language_loss": 0.78718424, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.81512916, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 2.90386962890625 + }, + { + "auxiliary_loss_clip": 0.01654094, + "auxiliary_loss_mlp": 0.01249046, + "balance_loss_clip": 1.31564581, + "balance_loss_mlp": 1.0125351, + "epoch": 0.22612355328423267, + "flos": 72096234899040.0, + "grad_norm": 0.9572228752597818, + "language_loss": 0.60038704, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62941843, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 3.250011920928955 + }, + { + "auxiliary_loss_clip": 0.01487968, + "auxiliary_loss_mlp": 0.01329538, + "balance_loss_clip": 1.13977599, + "balance_loss_mlp": 1.06727755, + "epoch": 0.22618367653690064, + "flos": 22785986665440.0, + "grad_norm": 2.4647220536003225, + "language_loss": 0.77859902, + "learning_rate": 3.609660729655211e-06, + "loss": 0.80677408, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.766477584838867 + }, + { + "auxiliary_loss_clip": 0.01495719, + "auxiliary_loss_mlp": 0.01309048, + "balance_loss_clip": 1.14851117, + "balance_loss_mlp": 1.04430842, + "epoch": 0.22624379978956863, + "flos": 20450309328000.0, + "grad_norm": 2.1417062622723373, + "language_loss": 0.79238242, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.8204301, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 2.840996503829956 + }, + { + "auxiliary_loss_clip": 0.01492816, + "auxiliary_loss_mlp": 0.01332, + "balance_loss_clip": 1.14415765, + "balance_loss_mlp": 1.064399, + "epoch": 0.2263039230422366, + "flos": 17496283004640.0, + "grad_norm": 1.7957106514109897, + "language_loss": 0.91598552, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.94423366, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 2.8394699096679688 + }, + { + "auxiliary_loss_clip": 0.01493633, + "auxiliary_loss_mlp": 0.01310951, + "balance_loss_clip": 1.14675117, + "balance_loss_mlp": 1.04869008, + "epoch": 0.22636404629490456, + "flos": 28332276878880.0, + "grad_norm": 1.7759384859764904, + "language_loss": 0.75363874, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.78168458, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 2.905083656311035 + }, + { + "auxiliary_loss_clip": 0.01495176, + "auxiliary_loss_mlp": 0.01326978, + "balance_loss_clip": 1.14784253, + "balance_loss_mlp": 1.06891334, + "epoch": 0.22642416954757252, + "flos": 17490821349600.0, + "grad_norm": 2.1035300444004856, + "language_loss": 0.90028065, + "learning_rate": 3.608735651752494e-06, + "loss": 0.9285022, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": 2.7488763332366943 + }, + { + "auxiliary_loss_clip": 0.01497962, + "auxiliary_loss_mlp": 0.01324146, + "balance_loss_clip": 1.15013087, + "balance_loss_mlp": 1.06512797, + "epoch": 0.2264842928002405, + "flos": 24386571047520.0, + "grad_norm": 1.4928549332798202, + "language_loss": 0.75022864, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.77844977, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 2.864132881164551 + }, + { + "auxiliary_loss_clip": 0.0149483, + "auxiliary_loss_mlp": 0.01333433, + "balance_loss_clip": 1.14749825, + "balance_loss_mlp": 1.07079089, + "epoch": 0.22654441605290845, + "flos": 19832680977120.0, + "grad_norm": 1.521910787741474, + "language_loss": 0.71568036, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.74396294, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 2.794048309326172 + }, + { + "auxiliary_loss_clip": 0.01496721, + "auxiliary_loss_mlp": 0.01314386, + "balance_loss_clip": 1.14935386, + "balance_loss_mlp": 1.05288887, + "epoch": 0.22660453930557642, + "flos": 27457075843200.0, + "grad_norm": 1.7898817562020617, + "language_loss": 0.78092414, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80903524, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 2.821028470993042 + }, + { + "auxiliary_loss_clip": 0.0149171, + "auxiliary_loss_mlp": 0.0130879, + "balance_loss_clip": 1.14407992, + "balance_loss_mlp": 1.04462206, + "epoch": 0.2266646625582444, + "flos": 23990445336960.0, + "grad_norm": 2.547596527960686, + "language_loss": 0.68538213, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.71338707, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.863142967224121 + }, + { + "auxiliary_loss_clip": 0.01496433, + "auxiliary_loss_mlp": 0.01315441, + "balance_loss_clip": 1.14912426, + "balance_loss_mlp": 1.04459786, + "epoch": 0.22672478581091238, + "flos": 26030280111840.0, + "grad_norm": 1.6700447034241315, + "language_loss": 0.80444139, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.83256012, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 2.8017523288726807 + }, + { + "auxiliary_loss_clip": 0.01496284, + "auxiliary_loss_mlp": 0.01300224, + "balance_loss_clip": 1.14869046, + "balance_loss_mlp": 1.03643763, + "epoch": 0.22678490906358034, + "flos": 23844079110240.0, + "grad_norm": 1.9169058667649521, + "language_loss": 0.78929937, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81726444, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 2.8167145252227783 + }, + { + "auxiliary_loss_clip": 0.01645565, + "auxiliary_loss_mlp": 0.01276855, + "balance_loss_clip": 1.31015468, + "balance_loss_mlp": 1.04644775, + "epoch": 0.2268450323162483, + "flos": 65055484023840.0, + "grad_norm": 0.6569020578626462, + "language_loss": 0.54296601, + "learning_rate": 3.607114417129261e-06, + "loss": 0.57219017, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 3.376887083053589 + }, + { + "auxiliary_loss_clip": 0.01495648, + "auxiliary_loss_mlp": 0.01297571, + "balance_loss_clip": 1.14801431, + "balance_loss_mlp": 1.03149569, + "epoch": 0.22690515556891627, + "flos": 22528148483520.0, + "grad_norm": 1.6498052593236594, + "language_loss": 0.7072469, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.73517907, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.7955620288848877 + }, + { + "auxiliary_loss_clip": 0.01490898, + "auxiliary_loss_mlp": 0.01309802, + "balance_loss_clip": 1.14425874, + "balance_loss_mlp": 1.04143786, + "epoch": 0.22696527882158424, + "flos": 18225421238880.0, + "grad_norm": 3.579393491797405, + "language_loss": 0.74503392, + "learning_rate": 3.606650658627658e-06, + "loss": 0.77304089, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.785309314727783 + }, + { + "auxiliary_loss_clip": 0.01498224, + "auxiliary_loss_mlp": 0.01302674, + "balance_loss_clip": 1.15083766, + "balance_loss_mlp": 1.03240323, + "epoch": 0.22702540207425223, + "flos": 17021152208160.0, + "grad_norm": 3.0441309335739164, + "language_loss": 0.82425565, + "learning_rate": 3.606418687985928e-06, + "loss": 0.85226464, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.75459623336792 + }, + { + "auxiliary_loss_clip": 0.01496282, + "auxiliary_loss_mlp": 0.013106, + "balance_loss_clip": 1.14952946, + "balance_loss_mlp": 1.03918457, + "epoch": 0.2270855253269202, + "flos": 21327937765920.0, + "grad_norm": 1.8245031391139024, + "language_loss": 0.82740271, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85547149, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.782318353652954 + }, + { + "auxiliary_loss_clip": 0.01503294, + "auxiliary_loss_mlp": 0.01310435, + "balance_loss_clip": 1.15649581, + "balance_loss_mlp": 1.04092646, + "epoch": 0.22714564857958816, + "flos": 23552749998720.0, + "grad_norm": 2.3301615341847515, + "language_loss": 0.72662377, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.7547611, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 2.87616229057312 + }, + { + "auxiliary_loss_clip": 0.01493772, + "auxiliary_loss_mlp": 0.01309636, + "balance_loss_clip": 1.14619172, + "balance_loss_mlp": 1.0384109, + "epoch": 0.22720577183225613, + "flos": 25992010234080.0, + "grad_norm": 4.198302545506589, + "language_loss": 0.64513397, + "learning_rate": 3.605722410602591e-06, + "loss": 0.67316806, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 2.8222391605377197 + }, + { + "auxiliary_loss_clip": 0.01493689, + "auxiliary_loss_mlp": 0.0129244, + "balance_loss_clip": 1.14713645, + "balance_loss_mlp": 1.01949883, + "epoch": 0.2272658950849241, + "flos": 20816205930720.0, + "grad_norm": 1.9568580711317136, + "language_loss": 0.7094512, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.73731256, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 2.8030881881713867 + }, + { + "auxiliary_loss_clip": 0.01504146, + "auxiliary_loss_mlp": 0.01318826, + "balance_loss_clip": 1.1579504, + "balance_loss_mlp": 1.04702878, + "epoch": 0.22732601833759206, + "flos": 23911288538400.0, + "grad_norm": 2.144476786324299, + "language_loss": 0.89484167, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.92307138, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.78357195854187 + }, + { + "auxiliary_loss_clip": 0.01495608, + "auxiliary_loss_mlp": 0.01309606, + "balance_loss_clip": 1.14934134, + "balance_loss_mlp": 1.03933418, + "epoch": 0.22738614159026002, + "flos": 15926231155680.0, + "grad_norm": 2.6489976862478013, + "language_loss": 0.73916805, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76722026, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 4.390119791030884 + }, + { + "auxiliary_loss_clip": 0.01497376, + "auxiliary_loss_mlp": 0.0130313, + "balance_loss_clip": 1.15139687, + "balance_loss_mlp": 1.03591084, + "epoch": 0.22744626484292801, + "flos": 24207737951520.0, + "grad_norm": 1.7401712348802993, + "language_loss": 0.8258245, + "learning_rate": 3.604793188351095e-06, + "loss": 0.85382962, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.8158185482025146 + }, + { + "auxiliary_loss_clip": 0.01504602, + "auxiliary_loss_mlp": 0.01318575, + "balance_loss_clip": 1.15839767, + "balance_loss_mlp": 1.05555117, + "epoch": 0.22750638809559598, + "flos": 24793999349760.0, + "grad_norm": 2.9711429864732106, + "language_loss": 0.75551033, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78374213, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 2.8317341804504395 + }, + { + "auxiliary_loss_clip": 0.01493984, + "auxiliary_loss_mlp": 0.01305192, + "balance_loss_clip": 1.14838123, + "balance_loss_mlp": 1.03759122, + "epoch": 0.22756651134826394, + "flos": 22238677851840.0, + "grad_norm": 1.7176018414438237, + "language_loss": 0.71024507, + "learning_rate": 3.604328212066594e-06, + "loss": 0.73823678, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 2.7855560779571533 + }, + { + "auxiliary_loss_clip": 0.01657673, + "auxiliary_loss_mlp": 0.01256256, + "balance_loss_clip": 1.32585788, + "balance_loss_mlp": 1.02050781, + "epoch": 0.2276266346009319, + "flos": 62714420887680.0, + "grad_norm": 0.8279407653797297, + "language_loss": 0.61882305, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.64796233, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.339989185333252 + }, + { + "auxiliary_loss_clip": 0.01501719, + "auxiliary_loss_mlp": 0.01321524, + "balance_loss_clip": 1.15576184, + "balance_loss_mlp": 1.05449557, + "epoch": 0.22768675785359987, + "flos": 18615326731200.0, + "grad_norm": 2.7257391584743784, + "language_loss": 0.86471164, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.8929441, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.756765365600586 + }, + { + "auxiliary_loss_clip": 0.01504061, + "auxiliary_loss_mlp": 0.01315603, + "balance_loss_clip": 1.15928483, + "balance_loss_mlp": 1.04876518, + "epoch": 0.22774688110626784, + "flos": 26872634996640.0, + "grad_norm": 1.3929288868810368, + "language_loss": 0.72750473, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.75570136, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 2.8203108310699463 + }, + { + "auxiliary_loss_clip": 0.01501304, + "auxiliary_loss_mlp": 0.01319893, + "balance_loss_clip": 1.15548086, + "balance_loss_mlp": 1.05515265, + "epoch": 0.2278070043589358, + "flos": 15555062538720.0, + "grad_norm": 2.7183512281524886, + "language_loss": 0.67317021, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.7013821, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 4.260986328125 + }, + { + "auxiliary_loss_clip": 0.0150662, + "auxiliary_loss_mlp": 0.01323658, + "balance_loss_clip": 1.16078985, + "balance_loss_mlp": 1.05605698, + "epoch": 0.2278671276116038, + "flos": 22418876361600.0, + "grad_norm": 2.1966696111038635, + "language_loss": 0.76578748, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.79409027, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 2.864772319793701 + }, + { + "auxiliary_loss_clip": 0.01494743, + "auxiliary_loss_mlp": 0.01304778, + "balance_loss_clip": 1.14992118, + "balance_loss_mlp": 1.0404191, + "epoch": 0.22792725086427176, + "flos": 20633466234240.0, + "grad_norm": 2.837607325928931, + "language_loss": 0.9099732, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93796843, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 4.252053737640381 + }, + { + "auxiliary_loss_clip": 0.01495262, + "auxiliary_loss_mlp": 0.01312726, + "balance_loss_clip": 1.14879429, + "balance_loss_mlp": 1.04836774, + "epoch": 0.22798737411693973, + "flos": 31431531584160.0, + "grad_norm": 4.139367430039298, + "language_loss": 0.82583082, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.85391068, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.8770198822021484 + }, + { + "auxiliary_loss_clip": 0.01650937, + "auxiliary_loss_mlp": 0.01245186, + "balance_loss_clip": 1.32102931, + "balance_loss_mlp": 1.00867462, + "epoch": 0.2280474973696077, + "flos": 52401841086240.0, + "grad_norm": 1.1431832164882, + "language_loss": 0.65652966, + "learning_rate": 3.602465874182981e-06, + "loss": 0.68549085, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 4.5749781131744385 + }, + { + "auxiliary_loss_clip": 0.01501072, + "auxiliary_loss_mlp": 0.01317387, + "balance_loss_clip": 1.15655541, + "balance_loss_mlp": 1.04692483, + "epoch": 0.22810762062227566, + "flos": 26398490332320.0, + "grad_norm": 2.3382768810011347, + "language_loss": 0.77282584, + "learning_rate": 3.602232808409293e-06, + "loss": 0.80101043, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.845430612564087 + }, + { + "auxiliary_loss_clip": 0.01495671, + "auxiliary_loss_mlp": 0.01321746, + "balance_loss_clip": 1.14883924, + "balance_loss_mlp": 1.05853224, + "epoch": 0.22816774387494362, + "flos": 25632675203040.0, + "grad_norm": 1.9766854060639312, + "language_loss": 0.81393439, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.84210861, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.87792706489563 + }, + { + "auxiliary_loss_clip": 0.0149335, + "auxiliary_loss_mlp": 0.01324551, + "balance_loss_clip": 1.14750338, + "balance_loss_mlp": 1.06229019, + "epoch": 0.22822786712761162, + "flos": 22453580992320.0, + "grad_norm": 1.863661939882044, + "language_loss": 0.77225262, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.80043161, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 2.803255319595337 + }, + { + "auxiliary_loss_clip": 0.01499162, + "auxiliary_loss_mlp": 0.01306581, + "balance_loss_clip": 1.15439963, + "balance_loss_mlp": 1.04012418, + "epoch": 0.22828799038027958, + "flos": 12204151941600.0, + "grad_norm": 6.324143183281291, + "language_loss": 0.96143055, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98948801, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 2.766078233718872 + }, + { + "auxiliary_loss_clip": 0.01498328, + "auxiliary_loss_mlp": 0.01308051, + "balance_loss_clip": 1.15291023, + "balance_loss_mlp": 1.04922342, + "epoch": 0.22834811363294755, + "flos": 22087608533280.0, + "grad_norm": 1.9767270064083888, + "language_loss": 0.8174504, + "learning_rate": 3.601299937834666e-06, + "loss": 0.84551418, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 2.8647279739379883 + }, + { + "auxiliary_loss_clip": 0.01497692, + "auxiliary_loss_mlp": 0.01323654, + "balance_loss_clip": 1.15222752, + "balance_loss_mlp": 1.05815089, + "epoch": 0.2284082368856155, + "flos": 24862801760640.0, + "grad_norm": 2.031356981986996, + "language_loss": 0.79036939, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.81858283, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 2.78574538230896 + }, + { + "auxiliary_loss_clip": 0.01497746, + "auxiliary_loss_mlp": 0.01316052, + "balance_loss_clip": 1.15233934, + "balance_loss_mlp": 1.05207491, + "epoch": 0.22846836013828348, + "flos": 23295025601280.0, + "grad_norm": 1.9045622793209906, + "language_loss": 0.75291669, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.78105462, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 2.7978851795196533 + }, + { + "auxiliary_loss_clip": 0.01497439, + "auxiliary_loss_mlp": 0.01332069, + "balance_loss_clip": 1.15140331, + "balance_loss_mlp": 1.06999898, + "epoch": 0.22852848339095144, + "flos": 27418692180960.0, + "grad_norm": 1.9347103627911932, + "language_loss": 0.64473271, + "learning_rate": 3.600599647297484e-06, + "loss": 0.67302787, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 2.8010149002075195 + }, + { + "auxiliary_loss_clip": 0.0149906, + "auxiliary_loss_mlp": 0.01309738, + "balance_loss_clip": 1.15418172, + "balance_loss_mlp": 1.04690552, + "epoch": 0.2285886066436194, + "flos": 26323467703200.0, + "grad_norm": 1.824918423895662, + "language_loss": 0.81923276, + "learning_rate": 3.60036609571682e-06, + "loss": 0.84732068, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.794149875640869 + }, + { + "auxiliary_loss_clip": 0.01496167, + "auxiliary_loss_mlp": 0.0130739, + "balance_loss_clip": 1.15088058, + "balance_loss_mlp": 1.03902555, + "epoch": 0.2286487298962874, + "flos": 29719095965280.0, + "grad_norm": 3.949756925975253, + "language_loss": 0.7860319, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81406748, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.8586385250091553 + }, + { + "auxiliary_loss_clip": 0.01487356, + "auxiliary_loss_mlp": 0.01312398, + "balance_loss_clip": 1.14129984, + "balance_loss_mlp": 1.04308057, + "epoch": 0.22870885314895537, + "flos": 21289288606560.0, + "grad_norm": 1.6529773952464781, + "language_loss": 0.85513896, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.88313651, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 2.780158281326294 + }, + { + "auxiliary_loss_clip": 0.01490629, + "auxiliary_loss_mlp": 0.01325103, + "balance_loss_clip": 1.14534461, + "balance_loss_mlp": 1.05864644, + "epoch": 0.22876897640162333, + "flos": 14941227003840.0, + "grad_norm": 3.3783647173500833, + "language_loss": 0.7671693, + "learning_rate": 3.59966507689401e-06, + "loss": 0.79532659, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.7709991931915283 + }, + { + "auxiliary_loss_clip": 0.01496682, + "auxiliary_loss_mlp": 0.01318856, + "balance_loss_clip": 1.15204203, + "balance_loss_mlp": 1.04381633, + "epoch": 0.2288290996542913, + "flos": 18115883619840.0, + "grad_norm": 2.2638016016234124, + "language_loss": 0.78852314, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81667852, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 2.7906911373138428 + }, + { + "auxiliary_loss_clip": 0.01498252, + "auxiliary_loss_mlp": 0.01321743, + "balance_loss_clip": 1.15320325, + "balance_loss_mlp": 1.05623972, + "epoch": 0.22888922290695926, + "flos": 39858418474560.0, + "grad_norm": 1.9308536405657541, + "language_loss": 0.69755352, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.72575343, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 3.0998964309692383 + }, + { + "auxiliary_loss_clip": 0.01502974, + "auxiliary_loss_mlp": 0.01315586, + "balance_loss_clip": 1.15882635, + "balance_loss_mlp": 1.04531479, + "epoch": 0.22894934615962723, + "flos": 23406004490400.0, + "grad_norm": 2.574220283281174, + "language_loss": 0.65897971, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.68716526, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.7655892372131348 + }, + { + "auxiliary_loss_clip": 0.01498319, + "auxiliary_loss_mlp": 0.01309605, + "balance_loss_clip": 1.15552974, + "balance_loss_mlp": 1.040097, + "epoch": 0.22900946941229522, + "flos": 18844983925920.0, + "grad_norm": 2.0934852429074287, + "language_loss": 0.75227934, + "learning_rate": 3.598729535939222e-06, + "loss": 0.78035855, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 2.760328769683838 + }, + { + "auxiliary_loss_clip": 0.01501421, + "auxiliary_loss_mlp": 0.01303902, + "balance_loss_clip": 1.15702105, + "balance_loss_mlp": 1.04030657, + "epoch": 0.22906959266496318, + "flos": 22931594328960.0, + "grad_norm": 1.929823980881591, + "language_loss": 0.81588542, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.84393871, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.7800309658050537 + }, + { + "auxiliary_loss_clip": 0.01505874, + "auxiliary_loss_mlp": 0.0131284, + "balance_loss_clip": 1.16340756, + "balance_loss_mlp": 1.04810023, + "epoch": 0.22912971591763115, + "flos": 19356564048480.0, + "grad_norm": 11.639416402885136, + "language_loss": 0.78824466, + "learning_rate": 3.598261401682441e-06, + "loss": 0.81643188, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 2.7603254318237305 + }, + { + "auxiliary_loss_clip": 0.01502301, + "auxiliary_loss_mlp": 0.01303421, + "balance_loss_clip": 1.15852427, + "balance_loss_mlp": 1.03887177, + "epoch": 0.22918983917029911, + "flos": 19935353599200.0, + "grad_norm": 1.8334165325046061, + "language_loss": 0.83330512, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.86136234, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.7757623195648193 + }, + { + "auxiliary_loss_clip": 0.01515553, + "auxiliary_loss_mlp": 0.01324492, + "balance_loss_clip": 1.17245591, + "balance_loss_mlp": 1.05402982, + "epoch": 0.22924996242296708, + "flos": 16692766920000.0, + "grad_norm": 2.7260719525618864, + "language_loss": 0.83558762, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.86398804, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.752216100692749 + }, + { + "auxiliary_loss_clip": 0.01505759, + "auxiliary_loss_mlp": 0.01316103, + "balance_loss_clip": 1.16310358, + "balance_loss_mlp": 1.05155337, + "epoch": 0.22931008567563504, + "flos": 33038943035040.0, + "grad_norm": 1.688648405043816, + "language_loss": 0.70261908, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.73083764, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.865316390991211 + }, + { + "auxiliary_loss_clip": 0.01505959, + "auxiliary_loss_mlp": 0.01315367, + "balance_loss_clip": 1.16303325, + "balance_loss_mlp": 1.05215299, + "epoch": 0.229370208928303, + "flos": 23332916197440.0, + "grad_norm": 2.5678138690415326, + "language_loss": 0.66970468, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69791788, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 2.7646541595458984 + }, + { + "auxiliary_loss_clip": 0.01511905, + "auxiliary_loss_mlp": 0.01300846, + "balance_loss_clip": 1.16827834, + "balance_loss_mlp": 1.03248179, + "epoch": 0.229430332180971, + "flos": 28619509749120.0, + "grad_norm": 4.467719669144115, + "language_loss": 0.83117259, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85930014, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.872825860977173 + }, + { + "auxiliary_loss_clip": 0.01507519, + "auxiliary_loss_mlp": 0.01305136, + "balance_loss_clip": 1.164572, + "balance_loss_mlp": 1.03791666, + "epoch": 0.22949045543363897, + "flos": 17240113661760.0, + "grad_norm": 2.2422797557420253, + "language_loss": 0.86718464, + "learning_rate": 3.596855544646742e-06, + "loss": 0.89531118, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 2.751532554626465 + }, + { + "auxiliary_loss_clip": 0.01506686, + "auxiliary_loss_mlp": 0.01320882, + "balance_loss_clip": 1.16360521, + "balance_loss_mlp": 1.05442584, + "epoch": 0.22955057868630693, + "flos": 27491856330240.0, + "grad_norm": 1.8197630817941182, + "language_loss": 0.74728191, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77555752, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 2.8806326389312744 + }, + { + "auxiliary_loss_clip": 0.01516768, + "auxiliary_loss_mlp": 0.01319527, + "balance_loss_clip": 1.17312562, + "balance_loss_mlp": 1.05364227, + "epoch": 0.2296107019389749, + "flos": 23478637645440.0, + "grad_norm": 2.004716459010696, + "language_loss": 0.74674612, + "learning_rate": 3.596386441116659e-06, + "loss": 0.77510905, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.8156564235687256 + }, + { + "auxiliary_loss_clip": 0.01508484, + "auxiliary_loss_mlp": 0.01311558, + "balance_loss_clip": 1.16710913, + "balance_loss_mlp": 1.04185915, + "epoch": 0.22967082519164286, + "flos": 31287858256800.0, + "grad_norm": 1.8916825113152433, + "language_loss": 0.80880636, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83700675, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 4.478313684463501 + }, + { + "auxiliary_loss_clip": 0.01511341, + "auxiliary_loss_mlp": 0.01312004, + "balance_loss_clip": 1.16887867, + "balance_loss_mlp": 1.04287767, + "epoch": 0.22973094844431083, + "flos": 14644436237280.0, + "grad_norm": 2.347708335598885, + "language_loss": 0.69536901, + "learning_rate": 3.595917095446042e-06, + "loss": 0.72360241, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 2.7493505477905273 + }, + { + "auxiliary_loss_clip": 0.01511578, + "auxiliary_loss_mlp": 0.01302777, + "balance_loss_clip": 1.17017484, + "balance_loss_mlp": 1.03841817, + "epoch": 0.2297910716969788, + "flos": 22826266735680.0, + "grad_norm": 1.8146475060037435, + "language_loss": 0.83052784, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85867137, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.8696537017822266 + }, + { + "auxiliary_loss_clip": 0.01517791, + "auxiliary_loss_mlp": 0.01307634, + "balance_loss_clip": 1.1756413, + "balance_loss_mlp": 1.04174924, + "epoch": 0.2298511949496468, + "flos": 23041245732480.0, + "grad_norm": 1.6424547945173185, + "language_loss": 0.66626942, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.69452369, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 2.788590908050537 + }, + { + "auxiliary_loss_clip": 0.01673915, + "auxiliary_loss_mlp": 0.01242325, + "balance_loss_clip": 1.34960914, + "balance_loss_mlp": 1.01649475, + "epoch": 0.22991131820231475, + "flos": 66897407838240.0, + "grad_norm": 0.7967077475596746, + "language_loss": 0.56767219, + "learning_rate": 3.595212623082357e-06, + "loss": 0.5968346, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.402303457260132 + }, + { + "auxiliary_loss_clip": 0.01512018, + "auxiliary_loss_mlp": 0.01312375, + "balance_loss_clip": 1.17068076, + "balance_loss_mlp": 1.0476346, + "epoch": 0.22997144145498272, + "flos": 17888767611840.0, + "grad_norm": 3.453546891891908, + "language_loss": 0.73005629, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75830019, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.719799280166626 + }, + { + "auxiliary_loss_clip": 0.01521061, + "auxiliary_loss_mlp": 0.01326469, + "balance_loss_clip": 1.18037474, + "balance_loss_mlp": 1.05677009, + "epoch": 0.23003156470765068, + "flos": 24678924219360.0, + "grad_norm": 2.333610819801449, + "language_loss": 0.87557918, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.90405446, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 2.822740316390991 + }, + { + "auxiliary_loss_clip": 0.01510972, + "auxiliary_loss_mlp": 0.01321748, + "balance_loss_clip": 1.16806912, + "balance_loss_mlp": 1.05681753, + "epoch": 0.23009168796031865, + "flos": 15815897045280.0, + "grad_norm": 2.2877224812275743, + "language_loss": 0.8158164, + "learning_rate": 3.594507606303083e-06, + "loss": 0.84414363, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 2.7668473720550537 + }, + { + "auxiliary_loss_clip": 0.01514194, + "auxiliary_loss_mlp": 0.01319406, + "balance_loss_clip": 1.17159033, + "balance_loss_mlp": 1.06038821, + "epoch": 0.2301518112129866, + "flos": 16214488086240.0, + "grad_norm": 2.603874272905333, + "language_loss": 0.86751091, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.89584696, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 4.226633310317993 + }, + { + "auxiliary_loss_clip": 0.01510159, + "auxiliary_loss_mlp": 0.01333828, + "balance_loss_clip": 1.16840267, + "balance_loss_mlp": 1.07500041, + "epoch": 0.2302119344656546, + "flos": 20597585830560.0, + "grad_norm": 2.154528506398112, + "language_loss": 0.70609611, + "learning_rate": 3.594037292782607e-06, + "loss": 0.73453593, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 4.325129985809326 + }, + { + "auxiliary_loss_clip": 0.01516236, + "auxiliary_loss_mlp": 0.01311633, + "balance_loss_clip": 1.17440939, + "balance_loss_mlp": 1.04994464, + "epoch": 0.23027205771832257, + "flos": 26799319134720.0, + "grad_norm": 2.347100492325552, + "language_loss": 0.84521228, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.87349105, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 2.799132823944092 + }, + { + "auxiliary_loss_clip": 0.01514667, + "auxiliary_loss_mlp": 0.01303575, + "balance_loss_clip": 1.173684, + "balance_loss_mlp": 1.03807187, + "epoch": 0.23033218097099054, + "flos": 43876567820160.0, + "grad_norm": 1.9395744283836323, + "language_loss": 0.67456418, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.70274657, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 4.462870121002197 + }, + { + "auxiliary_loss_clip": 0.01517893, + "auxiliary_loss_mlp": 0.01316086, + "balance_loss_clip": 1.1753999, + "balance_loss_mlp": 1.05020142, + "epoch": 0.2303923042236585, + "flos": 26070067116000.0, + "grad_norm": 2.473950709029051, + "language_loss": 0.75473452, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.78307426, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.7653005123138428 + }, + { + "auxiliary_loss_clip": 0.01516548, + "auxiliary_loss_mlp": 0.01304935, + "balance_loss_clip": 1.17458355, + "balance_loss_mlp": 1.0426743, + "epoch": 0.23045242747632647, + "flos": 18298357819200.0, + "grad_norm": 1.9593360239421433, + "language_loss": 0.87139201, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89960688, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 2.739445686340332 + }, + { + "auxiliary_loss_clip": 0.01518921, + "auxiliary_loss_mlp": 0.01319703, + "balance_loss_clip": 1.17607868, + "balance_loss_mlp": 1.05629849, + "epoch": 0.23051255072899443, + "flos": 25522910015040.0, + "grad_norm": 1.9728492123388817, + "language_loss": 0.75684446, + "learning_rate": 3.592860451331624e-06, + "loss": 0.7852307, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 2.79683518409729 + }, + { + "auxiliary_loss_clip": 0.01515866, + "auxiliary_loss_mlp": 0.01294872, + "balance_loss_clip": 1.17341244, + "balance_loss_mlp": 1.0314672, + "epoch": 0.2305726739816624, + "flos": 21217338158400.0, + "grad_norm": 6.173308820343298, + "language_loss": 0.85656583, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88467324, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 2.8809902667999268 + }, + { + "auxiliary_loss_clip": 0.0152058, + "auxiliary_loss_mlp": 0.01319749, + "balance_loss_clip": 1.17756987, + "balance_loss_mlp": 1.05462766, + "epoch": 0.2306327972343304, + "flos": 23333371335360.0, + "grad_norm": 2.893670339707124, + "language_loss": 0.82608378, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.85448712, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 2.86600661277771 + }, + { + "auxiliary_loss_clip": 0.01520939, + "auxiliary_loss_mlp": 0.01327393, + "balance_loss_clip": 1.17873812, + "balance_loss_mlp": 1.06360626, + "epoch": 0.23069292048699835, + "flos": 20668777715520.0, + "grad_norm": 2.406446505789556, + "language_loss": 0.79579437, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.8242777, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 2.844358444213867 + }, + { + "auxiliary_loss_clip": 0.01670837, + "auxiliary_loss_mlp": 0.01291359, + "balance_loss_clip": 1.34634018, + "balance_loss_mlp": 1.07239532, + "epoch": 0.23075304373966632, + "flos": 70460870029920.0, + "grad_norm": 0.9145704932408949, + "language_loss": 0.65394092, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.68356287, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 3.247668981552124 + }, + { + "auxiliary_loss_clip": 0.0151742, + "auxiliary_loss_mlp": 0.01299994, + "balance_loss_clip": 1.17478573, + "balance_loss_mlp": 1.03658867, + "epoch": 0.23081316699233428, + "flos": 16619868267840.0, + "grad_norm": 2.4644572727132235, + "language_loss": 0.76133168, + "learning_rate": 3.591682099845058e-06, + "loss": 0.78950584, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.749691963195801 + }, + { + "auxiliary_loss_clip": 0.01516111, + "auxiliary_loss_mlp": 0.01318473, + "balance_loss_clip": 1.17376888, + "balance_loss_mlp": 1.04915547, + "epoch": 0.23087329024500225, + "flos": 13299793629120.0, + "grad_norm": 1.9527780425502184, + "language_loss": 0.68524468, + "learning_rate": 3.591446248441752e-06, + "loss": 0.71359056, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 2.768524408340454 + }, + { + "auxiliary_loss_clip": 0.01516857, + "auxiliary_loss_mlp": 0.01332362, + "balance_loss_clip": 1.17434287, + "balance_loss_mlp": 1.06590533, + "epoch": 0.23093341349767021, + "flos": 17787687972480.0, + "grad_norm": 2.8737295749469625, + "language_loss": 0.7933867, + "learning_rate": 3.591210336690645e-06, + "loss": 0.82187879, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.743420362472534 + }, + { + "auxiliary_loss_clip": 0.01511883, + "auxiliary_loss_mlp": 0.01311307, + "balance_loss_clip": 1.16951728, + "balance_loss_mlp": 1.04618526, + "epoch": 0.23099353675033818, + "flos": 23990369480640.0, + "grad_norm": 2.059993157024762, + "language_loss": 0.8285526, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85678452, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.8245904445648193 + }, + { + "auxiliary_loss_clip": 0.01512919, + "auxiliary_loss_mlp": 0.01300956, + "balance_loss_clip": 1.17071223, + "balance_loss_mlp": 1.03449976, + "epoch": 0.23105366000300617, + "flos": 35998317228960.0, + "grad_norm": 1.561025871941225, + "language_loss": 0.66383016, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.69196892, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 2.9522974491119385 + }, + { + "auxiliary_loss_clip": 0.01515782, + "auxiliary_loss_mlp": 0.01297775, + "balance_loss_clip": 1.17267418, + "balance_loss_mlp": 1.03246236, + "epoch": 0.23111378325567414, + "flos": 31247843683680.0, + "grad_norm": 2.810567622965621, + "language_loss": 0.77466369, + "learning_rate": 3.590502239439987e-06, + "loss": 0.80279928, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.845114231109619 + }, + { + "auxiliary_loss_clip": 0.01518036, + "auxiliary_loss_mlp": 0.01316478, + "balance_loss_clip": 1.1751405, + "balance_loss_mlp": 1.04582524, + "epoch": 0.2311739065083421, + "flos": 19210235749920.0, + "grad_norm": 2.136749465529542, + "language_loss": 0.78283876, + "learning_rate": 3.590266086387156e-06, + "loss": 0.81118387, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 2.769158363342285 + }, + { + "auxiliary_loss_clip": 0.01516066, + "auxiliary_loss_mlp": 0.01304115, + "balance_loss_clip": 1.17373371, + "balance_loss_mlp": 1.03956604, + "epoch": 0.23123402976101007, + "flos": 23362007460480.0, + "grad_norm": 2.23308379231694, + "language_loss": 0.77208376, + "learning_rate": 3.590029873031276e-06, + "loss": 0.80028564, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.774963617324829 + }, + { + "auxiliary_loss_clip": 0.01516998, + "auxiliary_loss_mlp": 0.01320114, + "balance_loss_clip": 1.17389286, + "balance_loss_mlp": 1.05155909, + "epoch": 0.23129415301367803, + "flos": 13737071757600.0, + "grad_norm": 5.408749583735556, + "language_loss": 0.69522101, + "learning_rate": 3.589793599381304e-06, + "loss": 0.72359216, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.8165271282196045 + }, + { + "auxiliary_loss_clip": 0.01669299, + "auxiliary_loss_mlp": 0.01248688, + "balance_loss_clip": 1.34307909, + "balance_loss_mlp": 1.02285767, + "epoch": 0.231354276266346, + "flos": 69743679166080.0, + "grad_norm": 0.7990223394949602, + "language_loss": 0.61013889, + "learning_rate": 3.589557265446198e-06, + "loss": 0.6393187, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 3.280658721923828 + }, + { + "auxiliary_loss_clip": 0.01509375, + "auxiliary_loss_mlp": 0.01301243, + "balance_loss_clip": 1.16671968, + "balance_loss_mlp": 1.02830124, + "epoch": 0.231414399519014, + "flos": 18837474150240.0, + "grad_norm": 2.4017103544602643, + "language_loss": 0.78542113, + "learning_rate": 3.589320871234923e-06, + "loss": 0.81352735, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.7866225242614746 + }, + { + "auxiliary_loss_clip": 0.01512282, + "auxiliary_loss_mlp": 0.01311439, + "balance_loss_clip": 1.16845274, + "balance_loss_mlp": 1.04002309, + "epoch": 0.23147452277168196, + "flos": 36138311524800.0, + "grad_norm": 2.10363008902655, + "language_loss": 0.72311646, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.75135368, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 2.920581340789795 + }, + { + "auxiliary_loss_clip": 0.01518749, + "auxiliary_loss_mlp": 0.01320721, + "balance_loss_clip": 1.17485976, + "balance_loss_mlp": 1.05331063, + "epoch": 0.23153464602434992, + "flos": 20814992229600.0, + "grad_norm": 1.8915804687332949, + "language_loss": 0.76708424, + "learning_rate": 3.588847902019718e-06, + "loss": 0.79547894, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 2.8381099700927734 + }, + { + "auxiliary_loss_clip": 0.01513721, + "auxiliary_loss_mlp": 0.01326049, + "balance_loss_clip": 1.16936994, + "balance_loss_mlp": 1.06302524, + "epoch": 0.2315947692770179, + "flos": 19941384176640.0, + "grad_norm": 2.228963006349576, + "language_loss": 0.69809437, + "learning_rate": 3.588611327033723e-06, + "loss": 0.72649211, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.800729990005493 + }, + { + "auxiliary_loss_clip": 0.01508446, + "auxiliary_loss_mlp": 0.01313695, + "balance_loss_clip": 1.16363072, + "balance_loss_mlp": 1.04380488, + "epoch": 0.23165489252968585, + "flos": 12856977989280.0, + "grad_norm": 2.2400817473955232, + "language_loss": 0.67523348, + "learning_rate": 3.588374691807428e-06, + "loss": 0.70345485, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.733213424682617 + }, + { + "auxiliary_loss_clip": 0.01512981, + "auxiliary_loss_mlp": 0.01315854, + "balance_loss_clip": 1.16773534, + "balance_loss_mlp": 1.04768062, + "epoch": 0.23171501578235382, + "flos": 30630935967840.0, + "grad_norm": 1.6551021313404293, + "language_loss": 0.79857635, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82686472, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.882702589035034 + }, + { + "auxiliary_loss_clip": 0.0151006, + "auxiliary_loss_mlp": 0.01324678, + "balance_loss_clip": 1.16437995, + "balance_loss_mlp": 1.05688596, + "epoch": 0.23177513903502178, + "flos": 23845103170560.0, + "grad_norm": 2.474898052569868, + "language_loss": 0.6562317, + "learning_rate": 3.587901240669831e-06, + "loss": 0.68457907, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 2.8095476627349854 + }, + { + "auxiliary_loss_clip": 0.01515861, + "auxiliary_loss_mlp": 0.01336086, + "balance_loss_clip": 1.16996861, + "balance_loss_mlp": 1.07630503, + "epoch": 0.23183526228768978, + "flos": 29572881451200.0, + "grad_norm": 2.0422262267938134, + "language_loss": 0.7145682, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.74308765, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.9095511436462402 + }, + { + "auxiliary_loss_clip": 0.01511863, + "auxiliary_loss_mlp": 0.01322634, + "balance_loss_clip": 1.16587472, + "balance_loss_mlp": 1.0598011, + "epoch": 0.23189538554035774, + "flos": 34461528740640.0, + "grad_norm": 1.6980818052511788, + "language_loss": 0.77337718, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.80172217, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 2.869335651397705 + }, + { + "auxiliary_loss_clip": 0.01509306, + "auxiliary_loss_mlp": 0.01332447, + "balance_loss_clip": 1.16265082, + "balance_loss_mlp": 1.06122208, + "epoch": 0.2319555087930257, + "flos": 18005739150240.0, + "grad_norm": 2.6578750476179596, + "language_loss": 0.91533124, + "learning_rate": 3.587190612385584e-06, + "loss": 0.94374883, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 4.411968946456909 + }, + { + "auxiliary_loss_clip": 0.01514025, + "auxiliary_loss_mlp": 0.01319348, + "balance_loss_clip": 1.16793096, + "balance_loss_mlp": 1.05613387, + "epoch": 0.23201563204569367, + "flos": 23145814762560.0, + "grad_norm": 3.3233916741363907, + "language_loss": 0.76802927, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.796363, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.820099353790283 + }, + { + "auxiliary_loss_clip": 0.01508213, + "auxiliary_loss_mlp": 0.01316708, + "balance_loss_clip": 1.16142678, + "balance_loss_mlp": 1.05311275, + "epoch": 0.23207575529836164, + "flos": 20670105201120.0, + "grad_norm": 1.6670616768126558, + "language_loss": 0.84266973, + "learning_rate": 3.58671655924898e-06, + "loss": 0.87091899, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.7782700061798096 + }, + { + "auxiliary_loss_clip": 0.01516933, + "auxiliary_loss_mlp": 0.01330384, + "balance_loss_clip": 1.17065752, + "balance_loss_mlp": 1.06278276, + "epoch": 0.2321358785510296, + "flos": 16474070963520.0, + "grad_norm": 2.341189134165542, + "language_loss": 0.83168399, + "learning_rate": 3.586479442423508e-06, + "loss": 0.86015713, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.79872727394104 + }, + { + "auxiliary_loss_clip": 0.01511389, + "auxiliary_loss_mlp": 0.01311383, + "balance_loss_clip": 1.16487885, + "balance_loss_mlp": 1.04740644, + "epoch": 0.2321960018036976, + "flos": 21618470386080.0, + "grad_norm": 2.818582149198371, + "language_loss": 0.85899717, + "learning_rate": 3.586242265438576e-06, + "loss": 0.88722491, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 2.806396245956421 + }, + { + "auxiliary_loss_clip": 0.01509807, + "auxiliary_loss_mlp": 0.01326142, + "balance_loss_clip": 1.16218543, + "balance_loss_mlp": 1.06388128, + "epoch": 0.23225612505636556, + "flos": 22273572123360.0, + "grad_norm": 1.4046266746895208, + "language_loss": 0.75224566, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.7806052, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.779888153076172 + }, + { + "auxiliary_loss_clip": 0.01513969, + "auxiliary_loss_mlp": 0.01326297, + "balance_loss_clip": 1.16726232, + "balance_loss_mlp": 1.06556213, + "epoch": 0.23231624830903352, + "flos": 17054036287200.0, + "grad_norm": 3.3330780373989084, + "language_loss": 0.74602878, + "learning_rate": 3.58576773102631e-06, + "loss": 0.77443141, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 2.781111717224121 + }, + { + "auxiliary_loss_clip": 0.01508136, + "auxiliary_loss_mlp": 0.01315496, + "balance_loss_clip": 1.16022134, + "balance_loss_mlp": 1.0526638, + "epoch": 0.2323763715617015, + "flos": 34642827167040.0, + "grad_norm": 1.6892533598255692, + "language_loss": 0.70770276, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.73593909, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.9029126167297363 + }, + { + "auxiliary_loss_clip": 0.01519103, + "auxiliary_loss_mlp": 0.01325481, + "balance_loss_clip": 1.17067158, + "balance_loss_mlp": 1.05520976, + "epoch": 0.23243649481436945, + "flos": 25553783901600.0, + "grad_norm": 2.166225018608988, + "language_loss": 0.95096672, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97941256, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 2.8178718090057373 + }, + { + "auxiliary_loss_clip": 0.01521665, + "auxiliary_loss_mlp": 0.01301998, + "balance_loss_clip": 1.17354226, + "balance_loss_mlp": 1.03191757, + "epoch": 0.23249661806703742, + "flos": 20485469096640.0, + "grad_norm": 2.743579760860331, + "language_loss": 0.73517013, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.76340675, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 4.278920412063599 + }, + { + "auxiliary_loss_clip": 0.01510808, + "auxiliary_loss_mlp": 0.01310126, + "balance_loss_clip": 1.16273987, + "balance_loss_mlp": 1.04004562, + "epoch": 0.23255674131970538, + "flos": 20378738161440.0, + "grad_norm": 6.234344671558679, + "language_loss": 0.82469106, + "learning_rate": 3.584817940684145e-06, + "loss": 0.85290039, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 4.278358697891235 + }, + { + "auxiliary_loss_clip": 0.01516788, + "auxiliary_loss_mlp": 0.01307577, + "balance_loss_clip": 1.16789329, + "balance_loss_mlp": 1.04207468, + "epoch": 0.23261686457237338, + "flos": 17058018744000.0, + "grad_norm": 1.7727261398884377, + "language_loss": 0.73573512, + "learning_rate": 3.58458034283495e-06, + "loss": 0.76397872, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 5.564393520355225 + }, + { + "auxiliary_loss_clip": 0.01519283, + "auxiliary_loss_mlp": 0.01306778, + "balance_loss_clip": 1.17161775, + "balance_loss_mlp": 1.03555334, + "epoch": 0.23267698782504134, + "flos": 29172507786720.0, + "grad_norm": 6.726169392528798, + "language_loss": 0.79801786, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.82627851, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 2.8425674438476562 + }, + { + "auxiliary_loss_clip": 0.01514328, + "auxiliary_loss_mlp": 0.01321476, + "balance_loss_clip": 1.16555858, + "balance_loss_mlp": 1.05273056, + "epoch": 0.2327371110777093, + "flos": 21176565022080.0, + "grad_norm": 3.1474529351620735, + "language_loss": 0.69918263, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72754067, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 2.8364100456237793 + }, + { + "auxiliary_loss_clip": 0.01520284, + "auxiliary_loss_mlp": 0.0130902, + "balance_loss_clip": 1.17065084, + "balance_loss_mlp": 1.03893924, + "epoch": 0.23279723433037727, + "flos": 24865418803680.0, + "grad_norm": 1.9563447338271356, + "language_loss": 0.695683, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.72397614, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 2.7875471115112305 + }, + { + "auxiliary_loss_clip": 0.01521256, + "auxiliary_loss_mlp": 0.01319772, + "balance_loss_clip": 1.17180169, + "balance_loss_mlp": 1.03996432, + "epoch": 0.23285735758304524, + "flos": 38803701636000.0, + "grad_norm": 2.0863594889694146, + "language_loss": 0.78123194, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80964226, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 2.9628193378448486 + }, + { + "auxiliary_loss_clip": 0.01647234, + "auxiliary_loss_mlp": 0.01274323, + "balance_loss_clip": 1.31302452, + "balance_loss_mlp": 1.05230713, + "epoch": 0.2329174808357132, + "flos": 53950197663360.0, + "grad_norm": 0.854866715089218, + "language_loss": 0.60519397, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.63440943, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.21128249168396 + }, + { + "auxiliary_loss_clip": 0.01519907, + "auxiliary_loss_mlp": 0.01305154, + "balance_loss_clip": 1.17183924, + "balance_loss_mlp": 1.03621757, + "epoch": 0.23297760408838117, + "flos": 21218210506080.0, + "grad_norm": 2.7637082638889496, + "language_loss": 0.81080937, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83906001, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 2.820509910583496 + }, + { + "auxiliary_loss_clip": 0.0152545, + "auxiliary_loss_mlp": 0.0129676, + "balance_loss_clip": 1.175969, + "balance_loss_mlp": 1.02648854, + "epoch": 0.23303772734104916, + "flos": 28405554812640.0, + "grad_norm": 2.0088948297397664, + "language_loss": 0.61435473, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.64257681, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.895073175430298 + }, + { + "auxiliary_loss_clip": 0.01523293, + "auxiliary_loss_mlp": 0.01328694, + "balance_loss_clip": 1.17543006, + "balance_loss_mlp": 1.04926717, + "epoch": 0.23309785059371713, + "flos": 24316934217120.0, + "grad_norm": 2.6056463670725574, + "language_loss": 0.71372575, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.74224561, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 2.8621277809143066 + }, + { + "auxiliary_loss_clip": 0.01523174, + "auxiliary_loss_mlp": 0.01316167, + "balance_loss_clip": 1.17374277, + "balance_loss_mlp": 1.0449419, + "epoch": 0.2331579738463851, + "flos": 15994502572320.0, + "grad_norm": 2.340108885035251, + "language_loss": 0.81424105, + "learning_rate": 3.582439259339073e-06, + "loss": 0.84263444, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.7925000190734863 + }, + { + "auxiliary_loss_clip": 0.0152018, + "auxiliary_loss_mlp": 0.01311414, + "balance_loss_clip": 1.17139816, + "balance_loss_mlp": 1.0314151, + "epoch": 0.23321809709905306, + "flos": 36429792348960.0, + "grad_norm": 1.6045513878957842, + "language_loss": 0.75116205, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.77947807, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 3.038909435272217 + }, + { + "auxiliary_loss_clip": 0.01513023, + "auxiliary_loss_mlp": 0.0130171, + "balance_loss_clip": 1.16312325, + "balance_loss_mlp": 1.02914953, + "epoch": 0.23327822035172102, + "flos": 21326913705600.0, + "grad_norm": 2.447463861937684, + "language_loss": 0.89976501, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92791235, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 2.8302009105682373 + }, + { + "auxiliary_loss_clip": 0.01517265, + "auxiliary_loss_mlp": 0.01308659, + "balance_loss_clip": 1.16824627, + "balance_loss_mlp": 1.03514516, + "epoch": 0.233338343604389, + "flos": 19173900208320.0, + "grad_norm": 3.4814947164753933, + "language_loss": 0.72202653, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.75028574, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.978811264038086 + }, + { + "auxiliary_loss_clip": 0.01521436, + "auxiliary_loss_mlp": 0.01308776, + "balance_loss_clip": 1.1726104, + "balance_loss_mlp": 1.03659785, + "epoch": 0.23339846685705698, + "flos": 26910904874400.0, + "grad_norm": 2.614059030811981, + "language_loss": 0.6806891, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70899117, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.8706836700439453 + }, + { + "auxiliary_loss_clip": 0.0151891, + "auxiliary_loss_mlp": 0.01306999, + "balance_loss_clip": 1.16924906, + "balance_loss_mlp": 1.03710902, + "epoch": 0.23345859010972494, + "flos": 32345912773440.0, + "grad_norm": 2.3797077424698325, + "language_loss": 0.76961976, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.79787886, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 2.896171808242798 + }, + { + "auxiliary_loss_clip": 0.01638278, + "auxiliary_loss_mlp": 0.01241615, + "balance_loss_clip": 1.30223131, + "balance_loss_mlp": 1.01120758, + "epoch": 0.2335187133623929, + "flos": 58491457292160.0, + "grad_norm": 0.7820227503681633, + "language_loss": 0.59124744, + "learning_rate": 3.58100916965445e-06, + "loss": 0.62004632, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.4481916427612305 + }, + { + "auxiliary_loss_clip": 0.01516477, + "auxiliary_loss_mlp": 0.01312308, + "balance_loss_clip": 1.16778922, + "balance_loss_mlp": 1.04585123, + "epoch": 0.23357883661506088, + "flos": 24504832143360.0, + "grad_norm": 1.7944921504734488, + "language_loss": 0.80262899, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.83091688, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 2.9052720069885254 + }, + { + "auxiliary_loss_clip": 0.01516419, + "auxiliary_loss_mlp": 0.01321008, + "balance_loss_clip": 1.16654229, + "balance_loss_mlp": 1.05130887, + "epoch": 0.23363895986772884, + "flos": 18950273591040.0, + "grad_norm": 2.0417648367387717, + "language_loss": 0.88486814, + "learning_rate": 3.580531993380261e-06, + "loss": 0.91324246, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.8006818294525146 + }, + { + "auxiliary_loss_clip": 0.01521061, + "auxiliary_loss_mlp": 0.0130911, + "balance_loss_clip": 1.17102385, + "balance_loss_mlp": 1.03960192, + "epoch": 0.2336990831203968, + "flos": 31689559406880.0, + "grad_norm": 2.309303854449289, + "language_loss": 0.73855639, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.76685804, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.9337306022644043 + }, + { + "auxiliary_loss_clip": 0.01516752, + "auxiliary_loss_mlp": 0.01321784, + "balance_loss_clip": 1.16649246, + "balance_loss_mlp": 1.05475497, + "epoch": 0.23375920637306477, + "flos": 27712259053920.0, + "grad_norm": 1.981624638091518, + "language_loss": 0.84266114, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.87104654, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 2.8457603454589844 + }, + { + "auxiliary_loss_clip": 0.01525913, + "auxiliary_loss_mlp": 0.01325524, + "balance_loss_clip": 1.17500329, + "balance_loss_mlp": 1.05849528, + "epoch": 0.23381932962573276, + "flos": 17677543502880.0, + "grad_norm": 2.0851427123167294, + "language_loss": 0.87321639, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.90173072, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 2.7908663749694824 + }, + { + "auxiliary_loss_clip": 0.01522861, + "auxiliary_loss_mlp": 0.01319568, + "balance_loss_clip": 1.17155933, + "balance_loss_mlp": 1.05158579, + "epoch": 0.23387945287840073, + "flos": 14392514848320.0, + "grad_norm": 2.662937950025885, + "language_loss": 0.76383209, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79225641, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.8780510425567627 + }, + { + "auxiliary_loss_clip": 0.01514629, + "auxiliary_loss_mlp": 0.0131972, + "balance_loss_clip": 1.16331625, + "balance_loss_mlp": 1.05440748, + "epoch": 0.2339395761310687, + "flos": 46101493837440.0, + "grad_norm": 1.9034627885966617, + "language_loss": 0.73401022, + "learning_rate": 3.579338004009412e-06, + "loss": 0.76235366, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 3.0553700923919678 + }, + { + "auxiliary_loss_clip": 0.01518584, + "auxiliary_loss_mlp": 0.01305207, + "balance_loss_clip": 1.16717088, + "balance_loss_mlp": 1.03646159, + "epoch": 0.23399969938373666, + "flos": 22384209659040.0, + "grad_norm": 1.6514147743678858, + "language_loss": 0.8291083, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.8573463, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.820166826248169 + }, + { + "auxiliary_loss_clip": 0.01521473, + "auxiliary_loss_mlp": 0.01312109, + "balance_loss_clip": 1.1685555, + "balance_loss_mlp": 1.0429821, + "epoch": 0.23405982263640462, + "flos": 43511960774880.0, + "grad_norm": 1.678888490654125, + "language_loss": 0.650877, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67921281, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 3.037951707839966 + }, + { + "auxiliary_loss_clip": 0.0152906, + "auxiliary_loss_mlp": 0.0132005, + "balance_loss_clip": 1.17696166, + "balance_loss_mlp": 1.05397451, + "epoch": 0.2341199458890726, + "flos": 22566873499200.0, + "grad_norm": 2.2254878425593523, + "language_loss": 0.79213059, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.82062173, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 2.7695233821868896 + }, + { + "auxiliary_loss_clip": 0.01518513, + "auxiliary_loss_mlp": 0.01313726, + "balance_loss_clip": 1.1658231, + "balance_loss_mlp": 1.04650617, + "epoch": 0.23418006914174055, + "flos": 25636581803520.0, + "grad_norm": 1.4033662648968313, + "language_loss": 0.81810272, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.84642506, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.924220561981201 + }, + { + "auxiliary_loss_clip": 0.01523135, + "auxiliary_loss_mlp": 0.01313565, + "balance_loss_clip": 1.17087746, + "balance_loss_mlp": 1.04462934, + "epoch": 0.23424019239440855, + "flos": 13547239495200.0, + "grad_norm": 2.6587029586603865, + "language_loss": 0.80732173, + "learning_rate": 3.578142517422292e-06, + "loss": 0.83568871, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 2.7291793823242188 + }, + { + "auxiliary_loss_clip": 0.01526464, + "auxiliary_loss_mlp": 0.01315234, + "balance_loss_clip": 1.17446995, + "balance_loss_mlp": 1.0428642, + "epoch": 0.2343003156470765, + "flos": 22421720973600.0, + "grad_norm": 1.598898485843159, + "language_loss": 0.83220673, + "learning_rate": 3.577903240538623e-06, + "loss": 0.86062372, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 4.686462163925171 + }, + { + "auxiliary_loss_clip": 0.01517963, + "auxiliary_loss_mlp": 0.01324269, + "balance_loss_clip": 1.16521823, + "balance_loss_mlp": 1.0555234, + "epoch": 0.23436043889974448, + "flos": 14792433374880.0, + "grad_norm": 1.7136062860725056, + "language_loss": 0.79616773, + "learning_rate": 3.577663903820705e-06, + "loss": 0.82459009, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.7652764320373535 + }, + { + "auxiliary_loss_clip": 0.01532011, + "auxiliary_loss_mlp": 0.01319344, + "balance_loss_clip": 1.18071938, + "balance_loss_mlp": 1.05231476, + "epoch": 0.23442056215241244, + "flos": 22967891942400.0, + "grad_norm": 2.5359901051047524, + "language_loss": 0.74362171, + "learning_rate": 3.577424507277614e-06, + "loss": 0.77213526, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.8049747943878174 + }, + { + "auxiliary_loss_clip": 0.01530308, + "auxiliary_loss_mlp": 0.01320681, + "balance_loss_clip": 1.17836666, + "balance_loss_mlp": 1.04945612, + "epoch": 0.2344806854050804, + "flos": 23073902242560.0, + "grad_norm": 1.6446523260256234, + "language_loss": 0.75654221, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.78505206, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.8160486221313477 + }, + { + "auxiliary_loss_clip": 0.01514767, + "auxiliary_loss_mlp": 0.01325135, + "balance_loss_clip": 1.16325855, + "balance_loss_mlp": 1.05829692, + "epoch": 0.23454080865774837, + "flos": 16328994294240.0, + "grad_norm": 2.060739092348205, + "language_loss": 0.6714139, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69981289, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 2.928864002227783 + }, + { + "auxiliary_loss_clip": 0.01636143, + "auxiliary_loss_mlp": 0.01274452, + "balance_loss_clip": 1.29946065, + "balance_loss_mlp": 1.05396271, + "epoch": 0.23460093191041637, + "flos": 67767185147040.0, + "grad_norm": 0.7617809482118819, + "language_loss": 0.58181089, + "learning_rate": 3.576705958788091e-06, + "loss": 0.61091685, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 3.319843292236328 + }, + { + "auxiliary_loss_clip": 0.01525306, + "auxiliary_loss_mlp": 0.01308907, + "balance_loss_clip": 1.17509246, + "balance_loss_mlp": 1.03272283, + "epoch": 0.23466105516308433, + "flos": 20079330351840.0, + "grad_norm": 2.5285884103436187, + "language_loss": 0.80777961, + "learning_rate": 3.576466323035108e-06, + "loss": 0.8361218, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.87904691696167 + }, + { + "auxiliary_loss_clip": 0.01516687, + "auxiliary_loss_mlp": 0.01312186, + "balance_loss_clip": 1.16657519, + "balance_loss_mlp": 1.03962564, + "epoch": 0.2347211784157523, + "flos": 24538057575840.0, + "grad_norm": 1.908522218580756, + "language_loss": 0.81976163, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84805036, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.872473955154419 + }, + { + "auxiliary_loss_clip": 0.01524247, + "auxiliary_loss_mlp": 0.01318464, + "balance_loss_clip": 1.17512655, + "balance_loss_mlp": 1.04323387, + "epoch": 0.23478130166842026, + "flos": 23807288430720.0, + "grad_norm": 2.143099595091006, + "language_loss": 0.71823025, + "learning_rate": 3.57598687219895e-06, + "loss": 0.74665737, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 5.949056148529053 + }, + { + "auxiliary_loss_clip": 0.0151956, + "auxiliary_loss_mlp": 0.01309597, + "balance_loss_clip": 1.1699909, + "balance_loss_mlp": 1.03932571, + "epoch": 0.23484142492108823, + "flos": 24095697073920.0, + "grad_norm": 1.6899140873125296, + "language_loss": 0.71346676, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.74175835, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 2.833361864089966 + }, + { + "auxiliary_loss_clip": 0.01517945, + "auxiliary_loss_mlp": 0.01315775, + "balance_loss_clip": 1.16759181, + "balance_loss_mlp": 1.03787386, + "epoch": 0.2349015481737562, + "flos": 29098395433440.0, + "grad_norm": 2.7017743025968377, + "language_loss": 0.73187017, + "learning_rate": 3.575507182316473e-06, + "loss": 0.7602073, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 4.681717872619629 + }, + { + "auxiliary_loss_clip": 0.01518237, + "auxiliary_loss_mlp": 0.01303021, + "balance_loss_clip": 1.1672163, + "balance_loss_mlp": 1.02626503, + "epoch": 0.23496167142642416, + "flos": 18918299787840.0, + "grad_norm": 1.9725468970431115, + "language_loss": 0.73084033, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75905287, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 2.782456636428833 + }, + { + "auxiliary_loss_clip": 0.01635428, + "auxiliary_loss_mlp": 0.01248428, + "balance_loss_clip": 1.29977679, + "balance_loss_mlp": 1.02412415, + "epoch": 0.23502179467909215, + "flos": 55873705678560.0, + "grad_norm": 1.0134270888143273, + "language_loss": 0.73248494, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.76132357, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 3.1070709228515625 + }, + { + "auxiliary_loss_clip": 0.01522099, + "auxiliary_loss_mlp": 0.01310427, + "balance_loss_clip": 1.17091739, + "balance_loss_mlp": 1.04339826, + "epoch": 0.23508191793176011, + "flos": 23403880513440.0, + "grad_norm": 1.6750693407385893, + "language_loss": 0.87913418, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9074595, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 2.8315889835357666 + }, + { + "auxiliary_loss_clip": 0.01520047, + "auxiliary_loss_mlp": 0.01316668, + "balance_loss_clip": 1.1686548, + "balance_loss_mlp": 1.04410791, + "epoch": 0.23514204118442808, + "flos": 20049746022720.0, + "grad_norm": 2.0120958531412647, + "language_loss": 0.76508212, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.79344928, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 2.768827438354492 + }, + { + "auxiliary_loss_clip": 0.01523987, + "auxiliary_loss_mlp": 0.01312932, + "balance_loss_clip": 1.17263436, + "balance_loss_mlp": 1.04418635, + "epoch": 0.23520216443709605, + "flos": 21582779623200.0, + "grad_norm": 3.0703205282355164, + "language_loss": 0.81772339, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.84609258, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 2.8211090564727783 + }, + { + "auxiliary_loss_clip": 0.01524215, + "auxiliary_loss_mlp": 0.01316065, + "balance_loss_clip": 1.17124629, + "balance_loss_mlp": 1.05227911, + "epoch": 0.235262287689764, + "flos": 23188067097120.0, + "grad_norm": 2.207771648882444, + "language_loss": 0.71958101, + "learning_rate": 3.574066679118909e-06, + "loss": 0.74798387, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 2.893709897994995 + }, + { + "auxiliary_loss_clip": 0.01516838, + "auxiliary_loss_mlp": 0.01323677, + "balance_loss_clip": 1.16466522, + "balance_loss_mlp": 1.0467304, + "epoch": 0.23532241094243198, + "flos": 23187460246560.0, + "grad_norm": 1.7446700488693048, + "language_loss": 0.76136416, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78976929, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.7777230739593506 + }, + { + "auxiliary_loss_clip": 0.01513442, + "auxiliary_loss_mlp": 0.01315667, + "balance_loss_clip": 1.16167438, + "balance_loss_mlp": 1.04768491, + "epoch": 0.23538253419509997, + "flos": 17021228064480.0, + "grad_norm": 5.208520736487275, + "language_loss": 0.89872253, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.92701364, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.8210363388061523 + }, + { + "auxiliary_loss_clip": 0.01627196, + "auxiliary_loss_mlp": 0.01252876, + "balance_loss_clip": 1.2889173, + "balance_loss_mlp": 1.02857208, + "epoch": 0.23544265744776793, + "flos": 63454179371040.0, + "grad_norm": 0.8083170860501613, + "language_loss": 0.59370255, + "learning_rate": 3.573345621598854e-06, + "loss": 0.62250328, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 3.2382822036743164 + }, + { + "auxiliary_loss_clip": 0.01624108, + "auxiliary_loss_mlp": 0.01245972, + "balance_loss_clip": 1.2856102, + "balance_loss_mlp": 1.02090454, + "epoch": 0.2355027807004359, + "flos": 70522731223200.0, + "grad_norm": 0.7849584945105995, + "language_loss": 0.49491978, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.52362061, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.274017095565796 + }, + { + "auxiliary_loss_clip": 0.01518328, + "auxiliary_loss_mlp": 0.01327647, + "balance_loss_clip": 1.16602302, + "balance_loss_mlp": 1.05718541, + "epoch": 0.23556290395310386, + "flos": 21436451324640.0, + "grad_norm": 2.017787713425669, + "language_loss": 0.76862216, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.79708189, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.857714891433716 + }, + { + "auxiliary_loss_clip": 0.01512912, + "auxiliary_loss_mlp": 0.01334843, + "balance_loss_clip": 1.15972042, + "balance_loss_mlp": 1.0681957, + "epoch": 0.23562302720577183, + "flos": 18188365062240.0, + "grad_norm": 3.794498865145361, + "language_loss": 0.694242, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.72271955, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.7815005779266357 + }, + { + "auxiliary_loss_clip": 0.0151551, + "auxiliary_loss_mlp": 0.01323751, + "balance_loss_clip": 1.16243172, + "balance_loss_mlp": 1.06511426, + "epoch": 0.2356831504584398, + "flos": 33733376638560.0, + "grad_norm": 1.839197209580468, + "language_loss": 0.71012056, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.73851311, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 2.9689981937408447 + }, + { + "auxiliary_loss_clip": 0.01520929, + "auxiliary_loss_mlp": 0.0131727, + "balance_loss_clip": 1.16646242, + "balance_loss_mlp": 1.0542469, + "epoch": 0.23574327371110776, + "flos": 24934903921440.0, + "grad_norm": 1.702545159059873, + "language_loss": 0.77549535, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.80387735, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.843320846557617 + }, + { + "auxiliary_loss_clip": 0.01511676, + "auxiliary_loss_mlp": 0.01314454, + "balance_loss_clip": 1.15920866, + "balance_loss_mlp": 1.04914141, + "epoch": 0.23580339696377575, + "flos": 17824061442240.0, + "grad_norm": 2.353834976317075, + "language_loss": 0.75313807, + "learning_rate": 3.571901895946612e-06, + "loss": 0.78139931, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 2.81105899810791 + }, + { + "auxiliary_loss_clip": 0.01507134, + "auxiliary_loss_mlp": 0.01310053, + "balance_loss_clip": 1.15363348, + "balance_loss_mlp": 1.04721999, + "epoch": 0.23586352021644372, + "flos": 26289028569600.0, + "grad_norm": 2.139095392613836, + "language_loss": 0.80669904, + "learning_rate": 3.571661066327956e-06, + "loss": 0.83487093, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.8240389823913574 + }, + { + "auxiliary_loss_clip": 0.01510108, + "auxiliary_loss_mlp": 0.013229, + "balance_loss_clip": 1.15772343, + "balance_loss_mlp": 1.0614022, + "epoch": 0.23592364346911168, + "flos": 14248462239360.0, + "grad_norm": 2.347888574590862, + "language_loss": 0.74692118, + "learning_rate": 3.571420177111754e-06, + "loss": 0.77525121, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 2.819908380508423 + }, + { + "auxiliary_loss_clip": 0.01513113, + "auxiliary_loss_mlp": 0.01339894, + "balance_loss_clip": 1.16095805, + "balance_loss_mlp": 1.07763374, + "epoch": 0.23598376672177965, + "flos": 18589914499680.0, + "grad_norm": 1.755572569638049, + "language_loss": 0.8278411, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.85637116, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.8274927139282227 + }, + { + "auxiliary_loss_clip": 0.01507477, + "auxiliary_loss_mlp": 0.01309769, + "balance_loss_clip": 1.15407133, + "balance_loss_mlp": 1.0429306, + "epoch": 0.2360438899744476, + "flos": 22677814460160.0, + "grad_norm": 2.0642532694876246, + "language_loss": 0.59918296, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.62735546, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.7799429893493652 + }, + { + "auxiliary_loss_clip": 0.01512938, + "auxiliary_loss_mlp": 0.01319508, + "balance_loss_clip": 1.16087699, + "balance_loss_mlp": 1.0618248, + "epoch": 0.23610401322711558, + "flos": 29572729738560.0, + "grad_norm": 1.9983970433198375, + "language_loss": 0.71864659, + "learning_rate": 3.570697151969235e-06, + "loss": 0.74697107, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.894944429397583 + }, + { + "auxiliary_loss_clip": 0.01512424, + "auxiliary_loss_mlp": 0.01303313, + "balance_loss_clip": 1.16016817, + "balance_loss_mlp": 1.04048085, + "epoch": 0.23616413647978354, + "flos": 17860472840160.0, + "grad_norm": 1.913489673796113, + "language_loss": 0.75089991, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77905726, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 3.0105974674224854 + }, + { + "auxiliary_loss_clip": 0.01502698, + "auxiliary_loss_mlp": 0.01320888, + "balance_loss_clip": 1.1504333, + "balance_loss_mlp": 1.05099833, + "epoch": 0.23622425973245154, + "flos": 11036370165120.0, + "grad_norm": 3.480945487958139, + "language_loss": 0.819206, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.84744191, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.799607515335083 + }, + { + "auxiliary_loss_clip": 0.01511481, + "auxiliary_loss_mlp": 0.01328566, + "balance_loss_clip": 1.16008866, + "balance_loss_mlp": 1.0586766, + "epoch": 0.2362843829851195, + "flos": 23406307915680.0, + "grad_norm": 1.8131125480984807, + "language_loss": 0.7208693, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74926972, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 2.840609550476074 + }, + { + "auxiliary_loss_clip": 0.01505473, + "auxiliary_loss_mlp": 0.01302575, + "balance_loss_clip": 1.15371895, + "balance_loss_mlp": 1.03688097, + "epoch": 0.23634450623778747, + "flos": 39532953654720.0, + "grad_norm": 2.5290694766610375, + "language_loss": 0.74322897, + "learning_rate": 3.569732284634665e-06, + "loss": 0.77130944, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 3.036447763442993 + }, + { + "auxiliary_loss_clip": 0.0150761, + "auxiliary_loss_mlp": 0.01316283, + "balance_loss_clip": 1.15517473, + "balance_loss_mlp": 1.04639363, + "epoch": 0.23640462949045543, + "flos": 24209482646880.0, + "grad_norm": 2.1540656477459184, + "language_loss": 0.80668032, + "learning_rate": 3.569490918967136e-06, + "loss": 0.83491921, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.8160431385040283 + }, + { + "auxiliary_loss_clip": 0.01510928, + "auxiliary_loss_mlp": 0.01318881, + "balance_loss_clip": 1.1590023, + "balance_loss_mlp": 1.0579555, + "epoch": 0.2364647527431234, + "flos": 26180059872960.0, + "grad_norm": 1.4823134844878534, + "language_loss": 0.85864073, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.88693881, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 2.8498263359069824 + }, + { + "auxiliary_loss_clip": 0.01508584, + "auxiliary_loss_mlp": 0.01317898, + "balance_loss_clip": 1.15597534, + "balance_loss_mlp": 1.04610109, + "epoch": 0.23652487599579136, + "flos": 22639165300800.0, + "grad_norm": 2.1006596252268555, + "language_loss": 0.83171606, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.85998088, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.8090322017669678 + }, + { + "auxiliary_loss_clip": 0.0150422, + "auxiliary_loss_mlp": 0.01322107, + "balance_loss_clip": 1.15091753, + "balance_loss_mlp": 1.05774844, + "epoch": 0.23658499924845935, + "flos": 21764419403040.0, + "grad_norm": 1.953655608117697, + "language_loss": 0.78884006, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.81710339, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 4.468435764312744 + }, + { + "auxiliary_loss_clip": 0.0150565, + "auxiliary_loss_mlp": 0.01315609, + "balance_loss_clip": 1.15232456, + "balance_loss_mlp": 1.05373001, + "epoch": 0.23664512250112732, + "flos": 21801172154400.0, + "grad_norm": 6.618073670379666, + "language_loss": 0.79761022, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.82582277, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 2.7869412899017334 + }, + { + "auxiliary_loss_clip": 0.01505003, + "auxiliary_loss_mlp": 0.01311847, + "balance_loss_clip": 1.15094292, + "balance_loss_mlp": 1.04519999, + "epoch": 0.23670524575379528, + "flos": 22640075576640.0, + "grad_norm": 1.5287659363309518, + "language_loss": 0.79207385, + "learning_rate": 3.568283198083826e-06, + "loss": 0.82024235, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.873969793319702 + }, + { + "auxiliary_loss_clip": 0.01510897, + "auxiliary_loss_mlp": 0.01310396, + "balance_loss_clip": 1.15635109, + "balance_loss_mlp": 1.04584694, + "epoch": 0.23676536900646325, + "flos": 16726750915680.0, + "grad_norm": 2.390425438126537, + "language_loss": 0.85632348, + "learning_rate": 3.568041475462147e-06, + "loss": 0.88453639, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.802983522415161 + }, + { + "auxiliary_loss_clip": 0.01503166, + "auxiliary_loss_mlp": 0.01300183, + "balance_loss_clip": 1.14838505, + "balance_loss_mlp": 1.0371598, + "epoch": 0.23682549225913122, + "flos": 11136653313120.0, + "grad_norm": 2.7120400863773537, + "language_loss": 0.94393754, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.97197104, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 2.7726047039031982 + }, + { + "auxiliary_loss_clip": 0.01504666, + "auxiliary_loss_mlp": 0.01303184, + "balance_loss_clip": 1.14933395, + "balance_loss_mlp": 1.0317688, + "epoch": 0.23688561551179918, + "flos": 22561108418880.0, + "grad_norm": 1.9432667092733606, + "language_loss": 0.82284844, + "learning_rate": 3.567557851847088e-06, + "loss": 0.85092694, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 2.78542423248291 + }, + { + "auxiliary_loss_clip": 0.01499513, + "auxiliary_loss_mlp": 0.01315211, + "balance_loss_clip": 1.14444113, + "balance_loss_mlp": 1.04303241, + "epoch": 0.23694573876446715, + "flos": 18516864134880.0, + "grad_norm": 2.178426040319034, + "language_loss": 0.89001429, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91816151, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.7930257320404053 + }, + { + "auxiliary_loss_clip": 0.01496179, + "auxiliary_loss_mlp": 0.01306015, + "balance_loss_clip": 1.14178133, + "balance_loss_mlp": 1.03650665, + "epoch": 0.23700586201713514, + "flos": 15337238929920.0, + "grad_norm": 4.088712423951282, + "language_loss": 0.84764588, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.87566781, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.9654877185821533 + }, + { + "auxiliary_loss_clip": 0.01508578, + "auxiliary_loss_mlp": 0.01319626, + "balance_loss_clip": 1.15269876, + "balance_loss_mlp": 1.05584002, + "epoch": 0.2370659852698031, + "flos": 23949672200640.0, + "grad_norm": 2.1532601234106266, + "language_loss": 0.81468135, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.84296346, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 4.293899774551392 + }, + { + "auxiliary_loss_clip": 0.01506803, + "auxiliary_loss_mlp": 0.01323074, + "balance_loss_clip": 1.15036905, + "balance_loss_mlp": 1.05680847, + "epoch": 0.23712610852247107, + "flos": 15333863323680.0, + "grad_norm": 5.828320627224442, + "language_loss": 0.68683207, + "learning_rate": 3.566589891386959e-06, + "loss": 0.71513093, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 6.087591886520386 + }, + { + "auxiliary_loss_clip": 0.01502674, + "auxiliary_loss_mlp": 0.01302349, + "balance_loss_clip": 1.14642596, + "balance_loss_mlp": 1.03665543, + "epoch": 0.23718623177513903, + "flos": 19684607983200.0, + "grad_norm": 1.7953378631839183, + "language_loss": 0.75353181, + "learning_rate": 3.566347752735866e-06, + "loss": 0.781582, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 2.768587589263916 + }, + { + "auxiliary_loss_clip": 0.01501667, + "auxiliary_loss_mlp": 0.01312662, + "balance_loss_clip": 1.14707005, + "balance_loss_mlp": 1.04696894, + "epoch": 0.237246355027807, + "flos": 24975677057760.0, + "grad_norm": 1.5936183929601937, + "language_loss": 0.63859433, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.66673762, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 2.8496806621551514 + }, + { + "auxiliary_loss_clip": 0.01506628, + "auxiliary_loss_mlp": 0.0130629, + "balance_loss_clip": 1.15088487, + "balance_loss_mlp": 1.04250336, + "epoch": 0.23730647828047496, + "flos": 15379225767360.0, + "grad_norm": 2.5792417370712575, + "language_loss": 0.77193356, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.80006272, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 2.791825294494629 + }, + { + "auxiliary_loss_clip": 0.01509755, + "auxiliary_loss_mlp": 0.01329093, + "balance_loss_clip": 1.15400219, + "balance_loss_mlp": 1.06358981, + "epoch": 0.23736660153314296, + "flos": 28153367926560.0, + "grad_norm": 1.8746910517194566, + "language_loss": 0.80733001, + "learning_rate": 3.565620980442944e-06, + "loss": 0.83571851, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 2.8337268829345703 + }, + { + "auxiliary_loss_clip": 0.01509841, + "auxiliary_loss_mlp": 0.0132956, + "balance_loss_clip": 1.15367007, + "balance_loss_mlp": 1.06634581, + "epoch": 0.23742672478581092, + "flos": 22088518809120.0, + "grad_norm": 2.1176782627440804, + "language_loss": 0.80645227, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.83484626, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 2.7996058464050293 + }, + { + "auxiliary_loss_clip": 0.01506762, + "auxiliary_loss_mlp": 0.01319708, + "balance_loss_clip": 1.15075326, + "balance_loss_mlp": 1.05363238, + "epoch": 0.2374868480384789, + "flos": 19539152032320.0, + "grad_norm": 2.2211324465529843, + "language_loss": 0.73239231, + "learning_rate": 3.565136168723163e-06, + "loss": 0.76065707, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 2.7695837020874023 + }, + { + "auxiliary_loss_clip": 0.01507056, + "auxiliary_loss_mlp": 0.01316985, + "balance_loss_clip": 1.15143597, + "balance_loss_mlp": 1.05834889, + "epoch": 0.23754697129114685, + "flos": 19424266542720.0, + "grad_norm": 2.5992027488609946, + "language_loss": 0.73504698, + "learning_rate": 3.564893673833495e-06, + "loss": 0.76328737, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 2.8481340408325195 + }, + { + "auxiliary_loss_clip": 0.01511284, + "auxiliary_loss_mlp": 0.01339664, + "balance_loss_clip": 1.15561295, + "balance_loss_mlp": 1.07606816, + "epoch": 0.23760709454381482, + "flos": 19503081987840.0, + "grad_norm": 1.9465022447809084, + "language_loss": 0.74086571, + "learning_rate": 3.564651119602903e-06, + "loss": 0.76937515, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 2.88354229927063 + }, + { + "auxiliary_loss_clip": 0.01505171, + "auxiliary_loss_mlp": 0.01315326, + "balance_loss_clip": 1.14914656, + "balance_loss_mlp": 1.05001378, + "epoch": 0.23766721779648278, + "flos": 27639094904640.0, + "grad_norm": 1.91565073510384, + "language_loss": 0.71155727, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73976225, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.949367046356201 + }, + { + "auxiliary_loss_clip": 0.01513743, + "auxiliary_loss_mlp": 0.0132909, + "balance_loss_clip": 1.15693331, + "balance_loss_mlp": 1.06625712, + "epoch": 0.23772734104915075, + "flos": 23406611340960.0, + "grad_norm": 1.9756919674624012, + "language_loss": 0.81661212, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.8450405, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.805572748184204 + }, + { + "auxiliary_loss_clip": 0.01517383, + "auxiliary_loss_mlp": 0.01329697, + "balance_loss_clip": 1.16095448, + "balance_loss_mlp": 1.06686425, + "epoch": 0.23778746430181874, + "flos": 15707307630240.0, + "grad_norm": 2.5893094640251975, + "language_loss": 0.6656245, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.69409531, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.8419268131256104 + }, + { + "auxiliary_loss_clip": 0.01516783, + "auxiliary_loss_mlp": 0.0132164, + "balance_loss_clip": 1.15998673, + "balance_loss_mlp": 1.06109619, + "epoch": 0.2378475875544867, + "flos": 19428438640320.0, + "grad_norm": 1.635580903000644, + "language_loss": 0.83956265, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86794686, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 2.7801132202148438 + }, + { + "auxiliary_loss_clip": 0.01516339, + "auxiliary_loss_mlp": 0.01315283, + "balance_loss_clip": 1.16048813, + "balance_loss_mlp": 1.05035245, + "epoch": 0.23790771080715467, + "flos": 22270499942400.0, + "grad_norm": 5.030026426159927, + "language_loss": 0.84966838, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.87798464, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.866365671157837 + }, + { + "auxiliary_loss_clip": 0.01513128, + "auxiliary_loss_mlp": 0.01309618, + "balance_loss_clip": 1.15613985, + "balance_loss_mlp": 1.04697597, + "epoch": 0.23796783405982264, + "flos": 20049366741120.0, + "grad_norm": 2.1517971450741493, + "language_loss": 0.70331579, + "learning_rate": 3.563194548575151e-06, + "loss": 0.73154324, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.8274168968200684 + }, + { + "auxiliary_loss_clip": 0.01516682, + "auxiliary_loss_mlp": 0.01303929, + "balance_loss_clip": 1.1606636, + "balance_loss_mlp": 1.03785396, + "epoch": 0.2380279573124906, + "flos": 14247665748000.0, + "grad_norm": 3.698960536758946, + "language_loss": 0.66426504, + "learning_rate": 3.562951579215745e-06, + "loss": 0.69247121, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.7619729042053223 + }, + { + "auxiliary_loss_clip": 0.0151556, + "auxiliary_loss_mlp": 0.01314414, + "balance_loss_clip": 1.16028094, + "balance_loss_mlp": 1.04814792, + "epoch": 0.23808808056515857, + "flos": 21181343970240.0, + "grad_norm": 1.7424647253994598, + "language_loss": 0.71998268, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74828243, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 2.809077262878418 + }, + { + "auxiliary_loss_clip": 0.01515114, + "auxiliary_loss_mlp": 0.01314957, + "balance_loss_clip": 1.15811527, + "balance_loss_mlp": 1.05002654, + "epoch": 0.23814820381782653, + "flos": 22530500029440.0, + "grad_norm": 1.7798443782619557, + "language_loss": 0.74528718, + "learning_rate": 3.562465462704307e-06, + "loss": 0.77358794, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 2.8225204944610596 + }, + { + "auxiliary_loss_clip": 0.01515541, + "auxiliary_loss_mlp": 0.013017, + "balance_loss_clip": 1.15853691, + "balance_loss_mlp": 1.03390849, + "epoch": 0.23820832707049452, + "flos": 22306228633440.0, + "grad_norm": 1.7707602232610669, + "language_loss": 0.65894228, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68711472, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 2.8138976097106934 + }, + { + "auxiliary_loss_clip": 0.01516029, + "auxiliary_loss_mlp": 0.01310891, + "balance_loss_clip": 1.15857077, + "balance_loss_mlp": 1.04424405, + "epoch": 0.2382684503231625, + "flos": 24866594576640.0, + "grad_norm": 1.827291555378758, + "language_loss": 0.74322248, + "learning_rate": 3.561979109197483e-06, + "loss": 0.77149165, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 2.862125873565674 + }, + { + "auxiliary_loss_clip": 0.01525221, + "auxiliary_loss_mlp": 0.01321471, + "balance_loss_clip": 1.1686672, + "balance_loss_mlp": 1.05234373, + "epoch": 0.23832857357583045, + "flos": 21873767381280.0, + "grad_norm": 2.040660928902596, + "language_loss": 0.77196968, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.80043662, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 2.8191301822662354 + }, + { + "auxiliary_loss_clip": 0.01520886, + "auxiliary_loss_mlp": 0.01307771, + "balance_loss_clip": 1.16287661, + "balance_loss_mlp": 1.04341209, + "epoch": 0.23838869682849842, + "flos": 21290198882400.0, + "grad_norm": 2.0058023463833083, + "language_loss": 0.71360767, + "learning_rate": 3.561492518769045e-06, + "loss": 0.74189425, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.8759548664093018 + }, + { + "auxiliary_loss_clip": 0.01523805, + "auxiliary_loss_mlp": 0.01330831, + "balance_loss_clip": 1.16763735, + "balance_loss_mlp": 1.06933355, + "epoch": 0.23844882008116638, + "flos": 16182476354880.0, + "grad_norm": 1.9751690988619175, + "language_loss": 0.7821039, + "learning_rate": 3.561249134732282e-06, + "loss": 0.81065023, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.75968074798584 + }, + { + "auxiliary_loss_clip": 0.01514899, + "auxiliary_loss_mlp": 0.01295936, + "balance_loss_clip": 1.15948176, + "balance_loss_mlp": 1.02375722, + "epoch": 0.23850894333383435, + "flos": 21071882207520.0, + "grad_norm": 1.5547285092355483, + "language_loss": 0.68606198, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71417034, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 2.796459674835205 + }, + { + "auxiliary_loss_clip": 0.01534795, + "auxiliary_loss_mlp": 0.01319531, + "balance_loss_clip": 1.17875099, + "balance_loss_mlp": 1.05364609, + "epoch": 0.23856906658650234, + "flos": 17203512623040.0, + "grad_norm": 3.2181742793943364, + "language_loss": 0.68136227, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70990551, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 2.7263104915618896 + }, + { + "auxiliary_loss_clip": 0.01520982, + "auxiliary_loss_mlp": 0.01314683, + "balance_loss_clip": 1.16454482, + "balance_loss_mlp": 1.04155052, + "epoch": 0.2386291898391703, + "flos": 29496645120960.0, + "grad_norm": 1.964907937717253, + "language_loss": 0.76807809, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.79643476, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 2.8750762939453125 + }, + { + "auxiliary_loss_clip": 0.01520726, + "auxiliary_loss_mlp": 0.01306103, + "balance_loss_clip": 1.16448569, + "balance_loss_mlp": 1.03125381, + "epoch": 0.23868931309183827, + "flos": 21144591218880.0, + "grad_norm": 2.1961514728091207, + "language_loss": 0.76360095, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.79186922, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.78330659866333 + }, + { + "auxiliary_loss_clip": 0.01523245, + "auxiliary_loss_mlp": 0.01306936, + "balance_loss_clip": 1.16695809, + "balance_loss_mlp": 1.03952563, + "epoch": 0.23874943634450624, + "flos": 25661387184480.0, + "grad_norm": 2.4808780262674035, + "language_loss": 0.85712337, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.88542521, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.854680061340332 + }, + { + "auxiliary_loss_clip": 0.0169382, + "auxiliary_loss_mlp": 0.01239746, + "balance_loss_clip": 1.35051823, + "balance_loss_mlp": 1.01467896, + "epoch": 0.2388095595971742, + "flos": 58993403662080.0, + "grad_norm": 0.7367949479712638, + "language_loss": 0.62737465, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.65671027, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 3.35878324508667 + }, + { + "auxiliary_loss_clip": 0.01526209, + "auxiliary_loss_mlp": 0.01304685, + "balance_loss_clip": 1.16945148, + "balance_loss_mlp": 1.03670275, + "epoch": 0.23886968284984217, + "flos": 16802228682720.0, + "grad_norm": 2.293407744348052, + "language_loss": 0.81642401, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.84473288, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 4.421841144561768 + }, + { + "auxiliary_loss_clip": 0.01525055, + "auxiliary_loss_mlp": 0.01314856, + "balance_loss_clip": 1.16845667, + "balance_loss_mlp": 1.04439354, + "epoch": 0.23892980610251013, + "flos": 22384854437760.0, + "grad_norm": 1.7294385073193053, + "language_loss": 0.79538894, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.82378805, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.804286241531372 + }, + { + "auxiliary_loss_clip": 0.0152032, + "auxiliary_loss_mlp": 0.01310796, + "balance_loss_clip": 1.16470695, + "balance_loss_mlp": 1.04166913, + "epoch": 0.23898992935517813, + "flos": 12824852473440.0, + "grad_norm": 16.8824117194258, + "language_loss": 0.84882146, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.87713265, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.7983558177948 + }, + { + "auxiliary_loss_clip": 0.01520027, + "auxiliary_loss_mlp": 0.0131322, + "balance_loss_clip": 1.16264439, + "balance_loss_mlp": 1.0480988, + "epoch": 0.2390500526078461, + "flos": 22347836189280.0, + "grad_norm": 2.5079586818075046, + "language_loss": 0.83423543, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.8625679, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 2.763185739517212 + }, + { + "auxiliary_loss_clip": 0.01524389, + "auxiliary_loss_mlp": 0.0131743, + "balance_loss_clip": 1.16652787, + "balance_loss_mlp": 1.05631423, + "epoch": 0.23911017586051406, + "flos": 22637193036480.0, + "grad_norm": 2.295388218304141, + "language_loss": 0.74851978, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.77693796, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 2.8139755725860596 + }, + { + "auxiliary_loss_clip": 0.015219, + "auxiliary_loss_mlp": 0.01314822, + "balance_loss_clip": 1.16525531, + "balance_loss_mlp": 1.04912829, + "epoch": 0.23917029911318202, + "flos": 23655195051840.0, + "grad_norm": 1.9002599391752892, + "language_loss": 0.72400409, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.75237131, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 2.76872181892395 + }, + { + "auxiliary_loss_clip": 0.01536464, + "auxiliary_loss_mlp": 0.013309, + "balance_loss_clip": 1.17872345, + "balance_loss_mlp": 1.06329882, + "epoch": 0.23923042236585, + "flos": 22785797024640.0, + "grad_norm": 2.3457483403346946, + "language_loss": 0.78898942, + "learning_rate": 3.558079758168997e-06, + "loss": 0.81766307, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.8868751525878906 + }, + { + "auxiliary_loss_clip": 0.01528249, + "auxiliary_loss_mlp": 0.01333884, + "balance_loss_clip": 1.17087233, + "balance_loss_mlp": 1.07143307, + "epoch": 0.23929054561851795, + "flos": 28150257817440.0, + "grad_norm": 1.7981104556916145, + "language_loss": 0.8257097, + "learning_rate": 3.557835546134977e-06, + "loss": 0.85433108, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.836247444152832 + }, + { + "auxiliary_loss_clip": 0.01517611, + "auxiliary_loss_mlp": 0.01308909, + "balance_loss_clip": 1.16039491, + "balance_loss_mlp": 1.04626739, + "epoch": 0.23935066887118592, + "flos": 21688524426240.0, + "grad_norm": 2.255449578598687, + "language_loss": 0.83999509, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86826026, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.779904842376709 + }, + { + "auxiliary_loss_clip": 0.01519334, + "auxiliary_loss_mlp": 0.01332489, + "balance_loss_clip": 1.16184402, + "balance_loss_mlp": 1.0620265, + "epoch": 0.2394107921238539, + "flos": 32124903199200.0, + "grad_norm": 1.9446249910645288, + "language_loss": 0.77298743, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.80150568, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 4.386898994445801 + }, + { + "auxiliary_loss_clip": 0.01525041, + "auxiliary_loss_mlp": 0.01306602, + "balance_loss_clip": 1.16696155, + "balance_loss_mlp": 1.03957331, + "epoch": 0.23947091537652188, + "flos": 17021076351840.0, + "grad_norm": 2.364494464239686, + "language_loss": 0.78323543, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.81155187, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 4.27474570274353 + }, + { + "auxiliary_loss_clip": 0.0152312, + "auxiliary_loss_mlp": 0.01301016, + "balance_loss_clip": 1.16475594, + "balance_loss_mlp": 1.03532267, + "epoch": 0.23953103862918984, + "flos": 20595613566240.0, + "grad_norm": 1.806512644559839, + "language_loss": 0.73106682, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75930822, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.7992587089538574 + }, + { + "auxiliary_loss_clip": 0.01524573, + "auxiliary_loss_mlp": 0.01303021, + "balance_loss_clip": 1.16611326, + "balance_loss_mlp": 1.03656435, + "epoch": 0.2395911618818578, + "flos": 20706516599040.0, + "grad_norm": 1.8961248305894438, + "language_loss": 0.79250908, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.82078505, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 2.8018436431884766 + }, + { + "auxiliary_loss_clip": 0.01525259, + "auxiliary_loss_mlp": 0.01316113, + "balance_loss_clip": 1.16759372, + "balance_loss_mlp": 1.0471772, + "epoch": 0.23965128513452577, + "flos": 27056019471840.0, + "grad_norm": 2.143692230672646, + "language_loss": 0.73545295, + "learning_rate": 3.556369033716254e-06, + "loss": 0.76386666, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.8271596431732178 + }, + { + "auxiliary_loss_clip": 0.0151833, + "auxiliary_loss_mlp": 0.01318635, + "balance_loss_clip": 1.16032624, + "balance_loss_mlp": 1.04721951, + "epoch": 0.23971140838719374, + "flos": 23146307828640.0, + "grad_norm": 3.1048547016738794, + "language_loss": 0.88741386, + "learning_rate": 3.556124408363871e-06, + "loss": 0.91578346, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 2.7782132625579834 + }, + { + "auxiliary_loss_clip": 0.01519024, + "auxiliary_loss_mlp": 0.01313198, + "balance_loss_clip": 1.1629734, + "balance_loss_mlp": 1.0520817, + "epoch": 0.23977153163986173, + "flos": 18036157898880.0, + "grad_norm": 2.498545297143569, + "language_loss": 0.83467889, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.86300111, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 2.745830535888672 + }, + { + "auxiliary_loss_clip": 0.01511979, + "auxiliary_loss_mlp": 0.0130014, + "balance_loss_clip": 1.15574062, + "balance_loss_mlp": 1.03272974, + "epoch": 0.2398316548925297, + "flos": 18115314697440.0, + "grad_norm": 1.9390269239987026, + "language_loss": 0.85322642, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.88134766, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.753700017929077 + }, + { + "auxiliary_loss_clip": 0.01521682, + "auxiliary_loss_mlp": 0.01305724, + "balance_loss_clip": 1.16449583, + "balance_loss_mlp": 1.04022145, + "epoch": 0.23989177814519766, + "flos": 12569365837440.0, + "grad_norm": 3.0790313679867904, + "language_loss": 0.85061586, + "learning_rate": 3.555390178293477e-06, + "loss": 0.87888992, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 2.8996474742889404 + }, + { + "auxiliary_loss_clip": 0.01514214, + "auxiliary_loss_mlp": 0.01309441, + "balance_loss_clip": 1.15807974, + "balance_loss_mlp": 1.04431987, + "epoch": 0.23995190139786562, + "flos": 25266930312960.0, + "grad_norm": 1.5418853534111236, + "language_loss": 0.7596581, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.78789461, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.8255293369293213 + }, + { + "auxiliary_loss_clip": 0.01672455, + "auxiliary_loss_mlp": 0.01242981, + "balance_loss_clip": 1.32986736, + "balance_loss_mlp": 1.02096558, + "epoch": 0.2400120246505336, + "flos": 61966318573440.0, + "grad_norm": 0.8822776158466538, + "language_loss": 0.63710833, + "learning_rate": 3.554900396661656e-06, + "loss": 0.66626275, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 3.253471612930298 + }, + { + "auxiliary_loss_clip": 0.01673102, + "auxiliary_loss_mlp": 0.01259193, + "balance_loss_clip": 1.33073282, + "balance_loss_mlp": 1.03794098, + "epoch": 0.24007214790320155, + "flos": 66715047423360.0, + "grad_norm": 0.7527485552593353, + "language_loss": 0.62939411, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65871704, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.373339891433716 + }, + { + "auxiliary_loss_clip": 0.01518846, + "auxiliary_loss_mlp": 0.01318006, + "balance_loss_clip": 1.16403353, + "balance_loss_mlp": 1.05116796, + "epoch": 0.24013227115586952, + "flos": 25811280730080.0, + "grad_norm": 1.7029827012467056, + "language_loss": 0.77210349, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.80047202, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.8495290279388428 + }, + { + "auxiliary_loss_clip": 0.01519604, + "auxiliary_loss_mlp": 0.01327551, + "balance_loss_clip": 1.16445065, + "balance_loss_mlp": 1.05632591, + "epoch": 0.2401923944085375, + "flos": 25559890335360.0, + "grad_norm": 1.9993561689519208, + "language_loss": 0.78593218, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.81440371, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.808255672454834 + }, + { + "auxiliary_loss_clip": 0.01667369, + "auxiliary_loss_mlp": 0.01271095, + "balance_loss_clip": 1.32571363, + "balance_loss_mlp": 1.05060577, + "epoch": 0.24025251766120548, + "flos": 54947945676960.0, + "grad_norm": 0.8954043596620905, + "language_loss": 0.63443989, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.66382456, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 3.274369716644287 + }, + { + "auxiliary_loss_clip": 0.01509507, + "auxiliary_loss_mlp": 0.01305944, + "balance_loss_clip": 1.15378797, + "balance_loss_mlp": 1.0341469, + "epoch": 0.24031264091387344, + "flos": 20633162808960.0, + "grad_norm": 2.704368247016529, + "language_loss": 0.69777, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72592449, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.8066344261169434 + }, + { + "auxiliary_loss_clip": 0.01516122, + "auxiliary_loss_mlp": 0.01307997, + "balance_loss_clip": 1.16085207, + "balance_loss_mlp": 1.04535532, + "epoch": 0.2403727641665414, + "flos": 20888346019680.0, + "grad_norm": 5.992351735460068, + "language_loss": 0.87189257, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.90013385, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.7958338260650635 + }, + { + "auxiliary_loss_clip": 0.01515703, + "auxiliary_loss_mlp": 0.01317953, + "balance_loss_clip": 1.15964818, + "balance_loss_mlp": 1.04558396, + "epoch": 0.24043288741920937, + "flos": 22822246350720.0, + "grad_norm": 1.5774052430245475, + "language_loss": 0.76126242, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78959894, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.830231189727783 + }, + { + "auxiliary_loss_clip": 0.01517406, + "auxiliary_loss_mlp": 0.01317631, + "balance_loss_clip": 1.16307414, + "balance_loss_mlp": 1.05441713, + "epoch": 0.24049301067187734, + "flos": 27961866825120.0, + "grad_norm": 2.0795678486532334, + "language_loss": 0.72848988, + "learning_rate": 3.552938912398679e-06, + "loss": 0.75684023, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 2.897407054901123 + }, + { + "auxiliary_loss_clip": 0.01515285, + "auxiliary_loss_mlp": 0.01321905, + "balance_loss_clip": 1.16102719, + "balance_loss_mlp": 1.05277789, + "epoch": 0.24055313392454533, + "flos": 27453738165120.0, + "grad_norm": 2.5320831018250343, + "language_loss": 0.6694442, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.69781613, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 2.809953451156616 + }, + { + "auxiliary_loss_clip": 0.01515044, + "auxiliary_loss_mlp": 0.01309441, + "balance_loss_clip": 1.16089904, + "balance_loss_mlp": 1.04088593, + "epoch": 0.2406132571772133, + "flos": 25558980059520.0, + "grad_norm": 2.368328306837321, + "language_loss": 0.82942462, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.85766935, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.838879346847534 + }, + { + "auxiliary_loss_clip": 0.01518844, + "auxiliary_loss_mlp": 0.0131311, + "balance_loss_clip": 1.16479063, + "balance_loss_mlp": 1.05104065, + "epoch": 0.24067338042988126, + "flos": 24793771780800.0, + "grad_norm": 2.177873016936816, + "language_loss": 0.83320636, + "learning_rate": 3.552202383898897e-06, + "loss": 0.86152589, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 2.807208776473999 + }, + { + "auxiliary_loss_clip": 0.01519418, + "auxiliary_loss_mlp": 0.01320466, + "balance_loss_clip": 1.16648567, + "balance_loss_mlp": 1.05267465, + "epoch": 0.24073350368254923, + "flos": 21180016484640.0, + "grad_norm": 2.1893078112694533, + "language_loss": 0.87362373, + "learning_rate": 3.551956756667215e-06, + "loss": 0.90202254, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 2.7530386447906494 + }, + { + "auxiliary_loss_clip": 0.01502504, + "auxiliary_loss_mlp": 0.01314783, + "balance_loss_clip": 1.14816809, + "balance_loss_mlp": 1.04508448, + "epoch": 0.2407936269352172, + "flos": 22496629818240.0, + "grad_norm": 2.6667438919533173, + "language_loss": 0.77905905, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80723196, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.8773298263549805 + }, + { + "auxiliary_loss_clip": 0.01510608, + "auxiliary_loss_mlp": 0.01311841, + "balance_loss_clip": 1.15713573, + "balance_loss_mlp": 1.04996192, + "epoch": 0.24085375018788516, + "flos": 18553085892000.0, + "grad_norm": 1.7372280968988782, + "language_loss": 0.79260135, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.82082582, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 2.750058650970459 + }, + { + "auxiliary_loss_clip": 0.01508149, + "auxiliary_loss_mlp": 0.01310928, + "balance_loss_clip": 1.15432477, + "balance_loss_mlp": 1.04027557, + "epoch": 0.24091387344055312, + "flos": 24172843680000.0, + "grad_norm": 1.6462523198805559, + "language_loss": 0.71493661, + "learning_rate": 3.551219521907302e-06, + "loss": 0.74312741, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 2.818366527557373 + }, + { + "auxiliary_loss_clip": 0.01502866, + "auxiliary_loss_mlp": 0.01308101, + "balance_loss_clip": 1.14880896, + "balance_loss_mlp": 1.04793894, + "epoch": 0.24097399669322112, + "flos": 11037925219680.0, + "grad_norm": 20.851065761640328, + "language_loss": 0.76168674, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78979647, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.7536559104919434 + }, + { + "auxiliary_loss_clip": 0.01510694, + "auxiliary_loss_mlp": 0.01312332, + "balance_loss_clip": 1.15537262, + "balance_loss_mlp": 1.05045283, + "epoch": 0.24103411994588908, + "flos": 17166987440640.0, + "grad_norm": 2.3818344643301232, + "language_loss": 0.74816877, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.77639902, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 2.779578685760498 + }, + { + "auxiliary_loss_clip": 0.01505321, + "auxiliary_loss_mlp": 0.01309833, + "balance_loss_clip": 1.15122712, + "balance_loss_mlp": 1.047382, + "epoch": 0.24109424319855705, + "flos": 20670067272960.0, + "grad_norm": 1.7481899126508258, + "language_loss": 0.79842228, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82657385, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 2.7879841327667236 + }, + { + "auxiliary_loss_clip": 0.01505259, + "auxiliary_loss_mlp": 0.01319075, + "balance_loss_clip": 1.1513629, + "balance_loss_mlp": 1.05528831, + "epoch": 0.241154366451225, + "flos": 28184203884960.0, + "grad_norm": 2.263760016290766, + "language_loss": 0.70597923, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.73422253, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 4.497110843658447 + }, + { + "auxiliary_loss_clip": 0.01501318, + "auxiliary_loss_mlp": 0.01315085, + "balance_loss_clip": 1.14783108, + "balance_loss_mlp": 1.05358779, + "epoch": 0.24121448970389298, + "flos": 21691900032480.0, + "grad_norm": 2.7156776391597788, + "language_loss": 0.69435245, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.72251648, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 2.894774913787842 + }, + { + "auxiliary_loss_clip": 0.01513662, + "auxiliary_loss_mlp": 0.01320143, + "balance_loss_clip": 1.15845203, + "balance_loss_mlp": 1.05807304, + "epoch": 0.24127461295656094, + "flos": 39679319881440.0, + "grad_norm": 1.536596813970365, + "language_loss": 0.73538649, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.76372457, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.9549481868743896 + }, + { + "auxiliary_loss_clip": 0.01508468, + "auxiliary_loss_mlp": 0.01321782, + "balance_loss_clip": 1.15475702, + "balance_loss_mlp": 1.05418134, + "epoch": 0.2413347362092289, + "flos": 19137906020160.0, + "grad_norm": 3.663589369024243, + "language_loss": 0.88308203, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.91138458, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.858116388320923 + }, + { + "auxiliary_loss_clip": 0.01501918, + "auxiliary_loss_mlp": 0.0132129, + "balance_loss_clip": 1.14815044, + "balance_loss_mlp": 1.0487293, + "epoch": 0.2413948594618969, + "flos": 26941664976480.0, + "grad_norm": 2.252747273696346, + "language_loss": 0.94959855, + "learning_rate": 3.549250975045952e-06, + "loss": 0.97783065, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 2.8157598972320557 + }, + { + "auxiliary_loss_clip": 0.01506556, + "auxiliary_loss_mlp": 0.01327634, + "balance_loss_clip": 1.15158582, + "balance_loss_mlp": 1.06003344, + "epoch": 0.24145498271456486, + "flos": 25230443058720.0, + "grad_norm": 1.7688719889891098, + "language_loss": 0.83394361, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.86228549, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 2.8423430919647217 + }, + { + "auxiliary_loss_clip": 0.01509046, + "auxiliary_loss_mlp": 0.01315239, + "balance_loss_clip": 1.15508676, + "balance_loss_mlp": 1.05450416, + "epoch": 0.24151510596723283, + "flos": 40664968812000.0, + "grad_norm": 1.996403594002411, + "language_loss": 0.69256854, + "learning_rate": 3.54875825066639e-06, + "loss": 0.72081131, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 2.9937894344329834 + }, + { + "auxiliary_loss_clip": 0.01510563, + "auxiliary_loss_mlp": 0.01320042, + "balance_loss_clip": 1.15548539, + "balance_loss_mlp": 1.04919887, + "epoch": 0.2415752292199008, + "flos": 18148426345440.0, + "grad_norm": 1.7306773312151074, + "language_loss": 0.85274267, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.88104874, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.752354145050049 + }, + { + "auxiliary_loss_clip": 0.01641145, + "auxiliary_loss_mlp": 0.01239151, + "balance_loss_clip": 1.30012488, + "balance_loss_mlp": 1.01026917, + "epoch": 0.24163535247256876, + "flos": 67294709321760.0, + "grad_norm": 0.8164906876265009, + "language_loss": 0.60616297, + "learning_rate": 3.548265291370558e-06, + "loss": 0.6349659, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.384143829345703 + }, + { + "auxiliary_loss_clip": 0.01509526, + "auxiliary_loss_mlp": 0.01304981, + "balance_loss_clip": 1.15492892, + "balance_loss_mlp": 1.04310226, + "epoch": 0.24169547572523672, + "flos": 24931907596800.0, + "grad_norm": 2.00260362385806, + "language_loss": 0.73511511, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.76326025, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 4.395548105239868 + }, + { + "auxiliary_loss_clip": 0.01522064, + "auxiliary_loss_mlp": 0.01337634, + "balance_loss_clip": 1.16925025, + "balance_loss_mlp": 1.07499206, + "epoch": 0.24175559897790472, + "flos": 18729984651840.0, + "grad_norm": 1.9510435983078211, + "language_loss": 0.81544733, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84404427, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 4.35838508605957 + }, + { + "auxiliary_loss_clip": 0.01514558, + "auxiliary_loss_mlp": 0.01315652, + "balance_loss_clip": 1.16191566, + "balance_loss_mlp": 1.04595268, + "epoch": 0.24181572223057268, + "flos": 23041511229600.0, + "grad_norm": 2.2928113084058173, + "language_loss": 0.76573205, + "learning_rate": 3.547525412122378e-06, + "loss": 0.79403412, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 2.7919209003448486 + }, + { + "auxiliary_loss_clip": 0.01513303, + "auxiliary_loss_mlp": 0.01329475, + "balance_loss_clip": 1.16016769, + "balance_loss_mlp": 1.06511688, + "epoch": 0.24187584548324065, + "flos": 20378245095360.0, + "grad_norm": 2.2014135046201386, + "language_loss": 0.75318468, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.78161246, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.8178746700286865 + }, + { + "auxiliary_loss_clip": 0.01523801, + "auxiliary_loss_mlp": 0.01321736, + "balance_loss_clip": 1.1718688, + "balance_loss_mlp": 1.05527949, + "epoch": 0.2419359687359086, + "flos": 21399660645120.0, + "grad_norm": 2.7005506024574033, + "language_loss": 0.82037508, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84883046, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 2.7715306282043457 + }, + { + "auxiliary_loss_clip": 0.01525165, + "auxiliary_loss_mlp": 0.01316571, + "balance_loss_clip": 1.17159152, + "balance_loss_mlp": 1.05469203, + "epoch": 0.24199609198857658, + "flos": 18371370255840.0, + "grad_norm": 1.9954142650814137, + "language_loss": 0.86193824, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.89035559, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 2.802855968475342 + }, + { + "auxiliary_loss_clip": 0.0151981, + "auxiliary_loss_mlp": 0.01326289, + "balance_loss_clip": 1.16719198, + "balance_loss_mlp": 1.06097662, + "epoch": 0.24205621524124454, + "flos": 19465912026720.0, + "grad_norm": 2.216810302124422, + "language_loss": 0.72586751, + "learning_rate": 3.546538084949365e-06, + "loss": 0.75432855, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.8204612731933594 + }, + { + "auxiliary_loss_clip": 0.0151892, + "auxiliary_loss_mlp": 0.01318184, + "balance_loss_clip": 1.16635036, + "balance_loss_mlp": 1.0559231, + "epoch": 0.2421163384939125, + "flos": 14978131467840.0, + "grad_norm": 2.620386355633212, + "language_loss": 0.64482212, + "learning_rate": 3.546291106520509e-06, + "loss": 0.67319316, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.7935774326324463 + }, + { + "auxiliary_loss_clip": 0.01521177, + "auxiliary_loss_mlp": 0.01311037, + "balance_loss_clip": 1.16866946, + "balance_loss_mlp": 1.04496169, + "epoch": 0.2421764617465805, + "flos": 18664102709280.0, + "grad_norm": 2.76367411344582, + "language_loss": 0.70892447, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.73724663, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.7591469287872314 + }, + { + "auxiliary_loss_clip": 0.01658402, + "auxiliary_loss_mlp": 0.01249352, + "balance_loss_clip": 1.32099211, + "balance_loss_mlp": 1.0250473, + "epoch": 0.24223658499924847, + "flos": 64354261279680.0, + "grad_norm": 0.8463860248917962, + "language_loss": 0.55353349, + "learning_rate": 3.545796973765623e-06, + "loss": 0.58261108, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.3307178020477295 + }, + { + "auxiliary_loss_clip": 0.0152951, + "auxiliary_loss_mlp": 0.01336427, + "balance_loss_clip": 1.17720485, + "balance_loss_mlp": 1.0741663, + "epoch": 0.24229670825191643, + "flos": 25777600159680.0, + "grad_norm": 1.7632035195094073, + "language_loss": 0.7395069, + "learning_rate": 3.54554981945833e-06, + "loss": 0.7681663, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 2.815453052520752 + }, + { + "auxiliary_loss_clip": 0.01523395, + "auxiliary_loss_mlp": 0.01323333, + "balance_loss_clip": 1.17138839, + "balance_loss_mlp": 1.06221735, + "epoch": 0.2423568315045844, + "flos": 20669156997120.0, + "grad_norm": 2.361466000272108, + "language_loss": 0.76812333, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.79659063, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.78379225730896 + }, + { + "auxiliary_loss_clip": 0.01526664, + "auxiliary_loss_mlp": 0.01326115, + "balance_loss_clip": 1.17517471, + "balance_loss_mlp": 1.0627104, + "epoch": 0.24241695475725236, + "flos": 22418876361600.0, + "grad_norm": 2.241007510625343, + "language_loss": 0.65484035, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68336815, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 2.8226640224456787 + }, + { + "auxiliary_loss_clip": 0.01522444, + "auxiliary_loss_mlp": 0.01302712, + "balance_loss_clip": 1.17152154, + "balance_loss_mlp": 1.03740001, + "epoch": 0.24247707800992033, + "flos": 17130917396160.0, + "grad_norm": 2.110127933699809, + "language_loss": 0.81682312, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.84507465, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.7527236938476562 + }, + { + "auxiliary_loss_clip": 0.01524949, + "auxiliary_loss_mlp": 0.01310832, + "balance_loss_clip": 1.17308688, + "balance_loss_mlp": 1.04895353, + "epoch": 0.2425372012625883, + "flos": 31616471113920.0, + "grad_norm": 2.3406807209280696, + "language_loss": 0.70396769, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.73232549, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.8950259685516357 + }, + { + "auxiliary_loss_clip": 0.01527267, + "auxiliary_loss_mlp": 0.01319337, + "balance_loss_clip": 1.1745826, + "balance_loss_mlp": 1.05841219, + "epoch": 0.24259732451525629, + "flos": 16327894377600.0, + "grad_norm": 2.992960357917682, + "language_loss": 0.96561331, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.99407935, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.7612063884735107 + }, + { + "auxiliary_loss_clip": 0.01524506, + "auxiliary_loss_mlp": 0.01306079, + "balance_loss_clip": 1.17315173, + "balance_loss_mlp": 1.04191136, + "epoch": 0.24265744776792425, + "flos": 22858847389440.0, + "grad_norm": 2.116140193024766, + "language_loss": 0.78187907, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.81018496, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 2.8191614151000977 + }, + { + "auxiliary_loss_clip": 0.01525358, + "auxiliary_loss_mlp": 0.01291259, + "balance_loss_clip": 1.17275298, + "balance_loss_mlp": 1.02499306, + "epoch": 0.24271757102059222, + "flos": 21873729453120.0, + "grad_norm": 1.845731343289679, + "language_loss": 0.74274969, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.77091587, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.8822309970855713 + }, + { + "auxiliary_loss_clip": 0.01526791, + "auxiliary_loss_mlp": 0.01306902, + "balance_loss_clip": 1.17435038, + "balance_loss_mlp": 1.04158974, + "epoch": 0.24277769427326018, + "flos": 19210615031520.0, + "grad_norm": 2.3432076884521296, + "language_loss": 0.76234043, + "learning_rate": 3.543570475921171e-06, + "loss": 0.79067731, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.788455009460449 + }, + { + "auxiliary_loss_clip": 0.01530419, + "auxiliary_loss_mlp": 0.01323417, + "balance_loss_clip": 1.17946541, + "balance_loss_mlp": 1.06249201, + "epoch": 0.24283781752592815, + "flos": 19501906214880.0, + "grad_norm": 1.876628239670934, + "language_loss": 0.72419393, + "learning_rate": 3.543322794484905e-06, + "loss": 0.75273228, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 2.8431942462921143 + }, + { + "auxiliary_loss_clip": 0.01532085, + "auxiliary_loss_mlp": 0.01318977, + "balance_loss_clip": 1.1793915, + "balance_loss_mlp": 1.05385518, + "epoch": 0.2428979407785961, + "flos": 19904290071840.0, + "grad_norm": 6.7195302565239885, + "language_loss": 0.78539681, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.81390738, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 2.8302009105682373 + }, + { + "auxiliary_loss_clip": 0.01530362, + "auxiliary_loss_mlp": 0.01314487, + "balance_loss_clip": 1.17805707, + "balance_loss_mlp": 1.05241704, + "epoch": 0.2429580640312641, + "flos": 24718104372960.0, + "grad_norm": 1.9743088477467017, + "language_loss": 0.80421197, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.83266044, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.8548243045806885 + }, + { + "auxiliary_loss_clip": 0.01520879, + "auxiliary_loss_mlp": 0.01308553, + "balance_loss_clip": 1.16878438, + "balance_loss_mlp": 1.04209638, + "epoch": 0.24301818728393207, + "flos": 25632751059360.0, + "grad_norm": 2.4068217486231256, + "language_loss": 0.76575893, + "learning_rate": 3.542579399075957e-06, + "loss": 0.79405332, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 2.7963812351226807 + }, + { + "auxiliary_loss_clip": 0.01525632, + "auxiliary_loss_mlp": 0.01309679, + "balance_loss_clip": 1.1723702, + "balance_loss_mlp": 1.04875422, + "epoch": 0.24307831053660003, + "flos": 26143838115840.0, + "grad_norm": 1.819098986014125, + "language_loss": 0.81192529, + "learning_rate": 3.542331483604246e-06, + "loss": 0.84027839, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 2.8247740268707275 + }, + { + "auxiliary_loss_clip": 0.01522766, + "auxiliary_loss_mlp": 0.01306765, + "balance_loss_clip": 1.16974568, + "balance_loss_mlp": 1.03611231, + "epoch": 0.243138433789268, + "flos": 14973997298400.0, + "grad_norm": 2.582619285774822, + "language_loss": 0.73802751, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.76632285, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.7751917839050293 + }, + { + "auxiliary_loss_clip": 0.01526383, + "auxiliary_loss_mlp": 0.01309415, + "balance_loss_clip": 1.17394078, + "balance_loss_mlp": 1.0391438, + "epoch": 0.24319855704193596, + "flos": 25194031660800.0, + "grad_norm": 2.045047305259045, + "language_loss": 0.83796453, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.86632252, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.8375093936920166 + }, + { + "auxiliary_loss_clip": 0.01524067, + "auxiliary_loss_mlp": 0.01310293, + "balance_loss_clip": 1.17276514, + "balance_loss_mlp": 1.04459953, + "epoch": 0.24325868029460393, + "flos": 22129405729920.0, + "grad_norm": 2.0082487528945903, + "language_loss": 0.86979365, + "learning_rate": 3.541587386314541e-06, + "loss": 0.89813721, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 2.7965378761291504 + }, + { + "auxiliary_loss_clip": 0.01532361, + "auxiliary_loss_mlp": 0.01308867, + "balance_loss_clip": 1.17982078, + "balance_loss_mlp": 1.04927671, + "epoch": 0.2433188035472719, + "flos": 23584116951360.0, + "grad_norm": 1.8342763266404885, + "language_loss": 0.73118025, + "learning_rate": 3.5413392369578e-06, + "loss": 0.75959253, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 2.7754271030426025 + }, + { + "auxiliary_loss_clip": 0.0152281, + "auxiliary_loss_mlp": 0.01309987, + "balance_loss_clip": 1.17152429, + "balance_loss_mlp": 1.04715419, + "epoch": 0.2433789267999399, + "flos": 24465158923680.0, + "grad_norm": 2.4712608289145837, + "language_loss": 0.73185736, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.76018536, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 2.931971788406372 + }, + { + "auxiliary_loss_clip": 0.01523917, + "auxiliary_loss_mlp": 0.01308824, + "balance_loss_clip": 1.17280912, + "balance_loss_mlp": 1.04370308, + "epoch": 0.24343905005260785, + "flos": 16729823096640.0, + "grad_norm": 2.19282466000146, + "language_loss": 0.73206961, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.76039702, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.837966203689575 + }, + { + "auxiliary_loss_clip": 0.01517261, + "auxiliary_loss_mlp": 0.01303837, + "balance_loss_clip": 1.16646576, + "balance_loss_mlp": 1.04062307, + "epoch": 0.24349917330527582, + "flos": 20045611853280.0, + "grad_norm": 2.2519840480626714, + "language_loss": 0.73924279, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.76745379, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 4.461285829544067 + }, + { + "auxiliary_loss_clip": 0.01519139, + "auxiliary_loss_mlp": 0.01303162, + "balance_loss_clip": 1.16787982, + "balance_loss_mlp": 1.04338121, + "epoch": 0.24355929655794378, + "flos": 17422474076640.0, + "grad_norm": 5.183563348388449, + "language_loss": 0.74822867, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77645171, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 2.8136794567108154 + }, + { + "auxiliary_loss_clip": 0.01519219, + "auxiliary_loss_mlp": 0.01313712, + "balance_loss_clip": 1.16936743, + "balance_loss_mlp": 1.05412221, + "epoch": 0.24361941981061175, + "flos": 25413296539680.0, + "grad_norm": 2.5005389464217016, + "language_loss": 0.70413166, + "learning_rate": 3.540097613646296e-06, + "loss": 0.73246098, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 2.825563430786133 + }, + { + "auxiliary_loss_clip": 0.01521789, + "auxiliary_loss_mlp": 0.01324522, + "balance_loss_clip": 1.1706512, + "balance_loss_mlp": 1.05711138, + "epoch": 0.2436795430632797, + "flos": 22823042842080.0, + "grad_norm": 1.697444605785466, + "language_loss": 0.81449878, + "learning_rate": 3.539849113744351e-06, + "loss": 0.84296185, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.79095721244812 + }, + { + "auxiliary_loss_clip": 0.01527606, + "auxiliary_loss_mlp": 0.01308154, + "balance_loss_clip": 1.17668343, + "balance_loss_mlp": 1.04284215, + "epoch": 0.2437396663159477, + "flos": 15159543678720.0, + "grad_norm": 1.7812533793942644, + "language_loss": 0.78377295, + "learning_rate": 3.539600555451172e-06, + "loss": 0.81213057, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 2.878617525100708 + }, + { + "auxiliary_loss_clip": 0.01514986, + "auxiliary_loss_mlp": 0.01302964, + "balance_loss_clip": 1.16561627, + "balance_loss_mlp": 1.03879619, + "epoch": 0.24379978956861567, + "flos": 22093335685440.0, + "grad_norm": 1.9183204847406492, + "language_loss": 0.84307837, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.8712579, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 2.818263292312622 + }, + { + "auxiliary_loss_clip": 0.01506974, + "auxiliary_loss_mlp": 0.01319437, + "balance_loss_clip": 1.15822101, + "balance_loss_mlp": 1.05565047, + "epoch": 0.24385991282128364, + "flos": 31470484168800.0, + "grad_norm": 3.822041776007623, + "language_loss": 0.54548734, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57375145, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 2.8771538734436035 + }, + { + "auxiliary_loss_clip": 0.0151994, + "auxiliary_loss_mlp": 0.01314586, + "balance_loss_clip": 1.17086864, + "balance_loss_mlp": 1.05137181, + "epoch": 0.2439200360739516, + "flos": 23840893144800.0, + "grad_norm": 2.27982130857178, + "language_loss": 0.7985431, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82688832, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 2.8055062294006348 + }, + { + "auxiliary_loss_clip": 0.0151432, + "auxiliary_loss_mlp": 0.01306367, + "balance_loss_clip": 1.16531205, + "balance_loss_mlp": 1.04734957, + "epoch": 0.24398015932661957, + "flos": 19171890015840.0, + "grad_norm": 1.8380088683824352, + "language_loss": 0.79407907, + "learning_rate": 3.538605738554673e-06, + "loss": 0.82228589, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 7.255496025085449 + }, + { + "auxiliary_loss_clip": 0.01515186, + "auxiliary_loss_mlp": 0.01310406, + "balance_loss_clip": 1.16660929, + "balance_loss_mlp": 1.04719186, + "epoch": 0.24404028257928753, + "flos": 25264692551520.0, + "grad_norm": 1.6038955774370023, + "language_loss": 0.85767531, + "learning_rate": 3.538356888446756e-06, + "loss": 0.88593125, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 2.8518824577331543 + }, + { + "auxiliary_loss_clip": 0.01514109, + "auxiliary_loss_mlp": 0.01299464, + "balance_loss_clip": 1.16625237, + "balance_loss_mlp": 1.03987432, + "epoch": 0.2441004058319555, + "flos": 26469947714400.0, + "grad_norm": 1.687754352882175, + "language_loss": 0.74990606, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.77804184, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 2.858863115310669 + }, + { + "auxiliary_loss_clip": 0.01512373, + "auxiliary_loss_mlp": 0.01327276, + "balance_loss_clip": 1.16292214, + "balance_loss_mlp": 1.06310844, + "epoch": 0.2441605290846235, + "flos": 26762945664960.0, + "grad_norm": 7.480667210858138, + "language_loss": 0.73734951, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.765746, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.7906243801116943 + }, + { + "auxiliary_loss_clip": 0.01514836, + "auxiliary_loss_mlp": 0.01302723, + "balance_loss_clip": 1.16644478, + "balance_loss_mlp": 1.04256022, + "epoch": 0.24422065233729146, + "flos": 21107990180160.0, + "grad_norm": 1.7437977884094387, + "language_loss": 0.76195478, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.79013038, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 2.831221342086792 + }, + { + "auxiliary_loss_clip": 0.01517227, + "auxiliary_loss_mlp": 0.01313845, + "balance_loss_clip": 1.16975212, + "balance_loss_mlp": 1.05559015, + "epoch": 0.24428077558995942, + "flos": 25265413186560.0, + "grad_norm": 2.2915249902917267, + "language_loss": 0.85064173, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87895244, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 2.822476863861084 + }, + { + "auxiliary_loss_clip": 0.01517819, + "auxiliary_loss_mlp": 0.01308797, + "balance_loss_clip": 1.1694802, + "balance_loss_mlp": 1.04863477, + "epoch": 0.24434089884262739, + "flos": 20487365504640.0, + "grad_norm": 2.6928019573038946, + "language_loss": 0.68533957, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.71360576, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.7317378520965576 + }, + { + "auxiliary_loss_clip": 0.01516349, + "auxiliary_loss_mlp": 0.0130985, + "balance_loss_clip": 1.16712749, + "balance_loss_mlp": 1.04606402, + "epoch": 0.24440102209529535, + "flos": 23624093596320.0, + "grad_norm": 1.6547433244013163, + "language_loss": 0.70216036, + "learning_rate": 3.536862563102088e-06, + "loss": 0.73042232, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.9114558696746826 + }, + { + "auxiliary_loss_clip": 0.01514116, + "auxiliary_loss_mlp": 0.01314631, + "balance_loss_clip": 1.16539478, + "balance_loss_mlp": 1.04989123, + "epoch": 0.24446114534796332, + "flos": 20556623053440.0, + "grad_norm": 1.9536892165489677, + "language_loss": 0.84574217, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.87402964, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.782189130783081 + }, + { + "auxiliary_loss_clip": 0.01643197, + "auxiliary_loss_mlp": 0.01245178, + "balance_loss_clip": 1.3066628, + "balance_loss_mlp": 1.02163696, + "epoch": 0.24452126860063128, + "flos": 60395318520480.0, + "grad_norm": 0.7339575610049488, + "language_loss": 0.52215242, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.55103618, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 3.2556815147399902 + }, + { + "auxiliary_loss_clip": 0.01510716, + "auxiliary_loss_mlp": 0.01311307, + "balance_loss_clip": 1.16216779, + "balance_loss_mlp": 1.04694819, + "epoch": 0.24458139185329927, + "flos": 15123435706080.0, + "grad_norm": 2.767400180569203, + "language_loss": 0.72768068, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.75590092, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.8759374618530273 + }, + { + "auxiliary_loss_clip": 0.01513921, + "auxiliary_loss_mlp": 0.01304269, + "balance_loss_clip": 1.16426015, + "balance_loss_mlp": 1.04391599, + "epoch": 0.24464151510596724, + "flos": 28001084906880.0, + "grad_norm": 1.8313085436188083, + "language_loss": 0.7794013, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.80758321, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.8753082752227783 + }, + { + "auxiliary_loss_clip": 0.01513657, + "auxiliary_loss_mlp": 0.01315875, + "balance_loss_clip": 1.16524589, + "balance_loss_mlp": 1.05838323, + "epoch": 0.2447016383586352, + "flos": 19794980021760.0, + "grad_norm": 1.8723130415739035, + "language_loss": 0.80501223, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.8333075, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 2.803704261779785 + }, + { + "auxiliary_loss_clip": 0.01511522, + "auxiliary_loss_mlp": 0.01305052, + "balance_loss_clip": 1.16346276, + "balance_loss_mlp": 1.0467968, + "epoch": 0.24476176161130317, + "flos": 26069725762560.0, + "grad_norm": 2.609373805625825, + "language_loss": 0.84350121, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.87166697, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.80665922164917 + }, + { + "auxiliary_loss_clip": 0.01511078, + "auxiliary_loss_mlp": 0.013123, + "balance_loss_clip": 1.16316843, + "balance_loss_mlp": 1.04851413, + "epoch": 0.24482188486397113, + "flos": 18845552848320.0, + "grad_norm": 2.554199076570581, + "language_loss": 0.80005884, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82829261, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 2.7904579639434814 + }, + { + "auxiliary_loss_clip": 0.0151117, + "auxiliary_loss_mlp": 0.01310588, + "balance_loss_clip": 1.16413498, + "balance_loss_mlp": 1.05157018, + "epoch": 0.2448820081166391, + "flos": 21253939197120.0, + "grad_norm": 1.7143950639539565, + "language_loss": 0.70230925, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.73052686, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.759460687637329 + }, + { + "auxiliary_loss_clip": 0.0151129, + "auxiliary_loss_mlp": 0.0130953, + "balance_loss_clip": 1.16267586, + "balance_loss_mlp": 1.05318224, + "epoch": 0.2449421313693071, + "flos": 23952820237920.0, + "grad_norm": 3.3410500261192952, + "language_loss": 0.67543209, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.70364022, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.783761978149414 + }, + { + "auxiliary_loss_clip": 0.01634629, + "auxiliary_loss_mlp": 0.01265991, + "balance_loss_clip": 1.29992485, + "balance_loss_mlp": 1.04931641, + "epoch": 0.24500225462197506, + "flos": 60693778126080.0, + "grad_norm": 0.8937248197243386, + "language_loss": 0.68646884, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.71547502, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.393637180328369 + }, + { + "auxiliary_loss_clip": 0.01514748, + "auxiliary_loss_mlp": 0.01308745, + "balance_loss_clip": 1.16752601, + "balance_loss_mlp": 1.05144382, + "epoch": 0.24506237787464302, + "flos": 26286487382880.0, + "grad_norm": 1.8164231673812161, + "language_loss": 0.79683018, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.82506508, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 2.8031089305877686 + }, + { + "auxiliary_loss_clip": 0.01512608, + "auxiliary_loss_mlp": 0.01312697, + "balance_loss_clip": 1.16448283, + "balance_loss_mlp": 1.05234373, + "epoch": 0.245122501127311, + "flos": 20554043938560.0, + "grad_norm": 2.7743195019038565, + "language_loss": 0.82505107, + "learning_rate": 3.533867620434151e-06, + "loss": 0.85330409, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.8023219108581543 + }, + { + "auxiliary_loss_clip": 0.01518056, + "auxiliary_loss_mlp": 0.013255, + "balance_loss_clip": 1.17005277, + "balance_loss_mlp": 1.0651474, + "epoch": 0.24518262437997895, + "flos": 29135186112960.0, + "grad_norm": 2.3167127375968457, + "language_loss": 0.62767678, + "learning_rate": 3.533617663584082e-06, + "loss": 0.65611231, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.813910961151123 + }, + { + "auxiliary_loss_clip": 0.01510166, + "auxiliary_loss_mlp": 0.01327532, + "balance_loss_clip": 1.16243339, + "balance_loss_mlp": 1.0704217, + "epoch": 0.24524274763264692, + "flos": 23479168639680.0, + "grad_norm": 1.6086146089523408, + "language_loss": 0.75542855, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.78380555, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 2.914475202560425 + }, + { + "auxiliary_loss_clip": 0.01512623, + "auxiliary_loss_mlp": 0.01309294, + "balance_loss_clip": 1.16527009, + "balance_loss_mlp": 1.0474149, + "epoch": 0.24530287088531488, + "flos": 17203057485120.0, + "grad_norm": 2.186652257666183, + "language_loss": 0.755777, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.78399622, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 3.0085055828094482 + }, + { + "auxiliary_loss_clip": 0.01504806, + "auxiliary_loss_mlp": 0.01301899, + "balance_loss_clip": 1.1584065, + "balance_loss_mlp": 1.04574203, + "epoch": 0.24536299413798288, + "flos": 14869276555680.0, + "grad_norm": 2.0034802520855504, + "language_loss": 0.8301574, + "learning_rate": 3.532867444142186e-06, + "loss": 0.85822439, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.7944695949554443 + }, + { + "auxiliary_loss_clip": 0.0151483, + "auxiliary_loss_mlp": 0.01318291, + "balance_loss_clip": 1.16745627, + "balance_loss_mlp": 1.06327868, + "epoch": 0.24542311739065084, + "flos": 35264968968960.0, + "grad_norm": 1.9583307283840117, + "language_loss": 0.73610806, + "learning_rate": 3.532617254729267e-06, + "loss": 0.76443928, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 2.8940746784210205 + }, + { + "auxiliary_loss_clip": 0.01515889, + "auxiliary_loss_mlp": 0.01302148, + "balance_loss_clip": 1.1696527, + "balance_loss_mlp": 1.04293907, + "epoch": 0.2454832406433188, + "flos": 21505064094720.0, + "grad_norm": 1.6231167602113887, + "language_loss": 0.72466171, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.75284207, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.7569496631622314 + }, + { + "auxiliary_loss_clip": 0.01512771, + "auxiliary_loss_mlp": 0.01310459, + "balance_loss_clip": 1.1659497, + "balance_loss_mlp": 1.04991531, + "epoch": 0.24554336389598677, + "flos": 14758411451040.0, + "grad_norm": 3.172927648236994, + "language_loss": 0.75287056, + "learning_rate": 3.532116701561919e-06, + "loss": 0.7811029, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.7607858180999756 + }, + { + "auxiliary_loss_clip": 0.01509206, + "auxiliary_loss_mlp": 0.0130162, + "balance_loss_clip": 1.16146255, + "balance_loss_mlp": 1.0410763, + "epoch": 0.24560348714865474, + "flos": 14978207324160.0, + "grad_norm": 2.1472776440433288, + "language_loss": 0.85573947, + "learning_rate": 3.531866337826471e-06, + "loss": 0.88384771, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 2.7416939735412598 + }, + { + "auxiliary_loss_clip": 0.01516331, + "auxiliary_loss_mlp": 0.01304556, + "balance_loss_clip": 1.17014003, + "balance_loss_mlp": 1.04439414, + "epoch": 0.2456636104013227, + "flos": 22677586891200.0, + "grad_norm": 2.0351929478196786, + "language_loss": 0.79148984, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.81969869, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 2.779081106185913 + }, + { + "auxiliary_loss_clip": 0.01504477, + "auxiliary_loss_mlp": 0.01296682, + "balance_loss_clip": 1.15846455, + "balance_loss_mlp": 1.03804553, + "epoch": 0.2457237336539907, + "flos": 27420361020000.0, + "grad_norm": 1.6373398323935402, + "language_loss": 0.75412476, + "learning_rate": 3.531365436099496e-06, + "loss": 0.78213632, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.9006152153015137 + }, + { + "auxiliary_loss_clip": 0.01519741, + "auxiliary_loss_mlp": 0.01326671, + "balance_loss_clip": 1.17253494, + "balance_loss_mlp": 1.06650889, + "epoch": 0.24578385690665866, + "flos": 20414466852480.0, + "grad_norm": 2.4554632004703913, + "language_loss": 0.79333144, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.82179558, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 4.520047664642334 + }, + { + "auxiliary_loss_clip": 0.01503276, + "auxiliary_loss_mlp": 0.01294251, + "balance_loss_clip": 1.15643764, + "balance_loss_mlp": 1.03942943, + "epoch": 0.24584398015932662, + "flos": 23917698397440.0, + "grad_norm": 1.6557048614702585, + "language_loss": 0.77387357, + "learning_rate": 3.5308643020944e-06, + "loss": 0.80184889, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 2.8814945220947266 + }, + { + "auxiliary_loss_clip": 0.01515211, + "auxiliary_loss_mlp": 0.01305882, + "balance_loss_clip": 1.1676569, + "balance_loss_mlp": 1.04800797, + "epoch": 0.2459041034119946, + "flos": 41499358783200.0, + "grad_norm": 3.5443512487423283, + "language_loss": 0.81638938, + "learning_rate": 3.530613648011309e-06, + "loss": 0.84460032, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 2.945891857147217 + }, + { + "auxiliary_loss_clip": 0.01516989, + "auxiliary_loss_mlp": 0.01317427, + "balance_loss_clip": 1.16890478, + "balance_loss_mlp": 1.06146097, + "epoch": 0.24596422666466256, + "flos": 19938805061760.0, + "grad_norm": 1.8453162030629966, + "language_loss": 0.73361504, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.7619592, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.7656376361846924 + }, + { + "auxiliary_loss_clip": 0.01513555, + "auxiliary_loss_mlp": 0.01311868, + "balance_loss_clip": 1.16555929, + "balance_loss_mlp": 1.05666494, + "epoch": 0.24602434991733052, + "flos": 21546595794240.0, + "grad_norm": 3.4026641955638217, + "language_loss": 0.76696998, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79522419, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.819819450378418 + }, + { + "auxiliary_loss_clip": 0.01512264, + "auxiliary_loss_mlp": 0.01307876, + "balance_loss_clip": 1.16392088, + "balance_loss_mlp": 1.04790461, + "epoch": 0.24608447316999849, + "flos": 23187308533920.0, + "grad_norm": 2.452671033646673, + "language_loss": 0.81770796, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.84590936, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.7596123218536377 + }, + { + "auxiliary_loss_clip": 0.01510659, + "auxiliary_loss_mlp": 0.01310861, + "balance_loss_clip": 1.16312075, + "balance_loss_mlp": 1.04879153, + "epoch": 0.24614459642266648, + "flos": 19643758990560.0, + "grad_norm": 2.3715336919803773, + "language_loss": 0.87260187, + "learning_rate": 3.529610451363797e-06, + "loss": 0.9008171, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 2.8111636638641357 + }, + { + "auxiliary_loss_clip": 0.01631179, + "auxiliary_loss_mlp": 0.01269547, + "balance_loss_clip": 1.29538703, + "balance_loss_mlp": 1.05058289, + "epoch": 0.24620471967533444, + "flos": 61745650352640.0, + "grad_norm": 0.7673335075999521, + "language_loss": 0.57485807, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.60386527, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 4.898192882537842 + }, + { + "auxiliary_loss_clip": 0.01631746, + "auxiliary_loss_mlp": 0.01262146, + "balance_loss_clip": 1.29564989, + "balance_loss_mlp": 1.04241943, + "epoch": 0.2462648429280024, + "flos": 69161134727520.0, + "grad_norm": 0.6649574538610479, + "language_loss": 0.56277788, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.59171677, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 7.770925760269165 + }, + { + "auxiliary_loss_clip": 0.0151787, + "auxiliary_loss_mlp": 0.0132311, + "balance_loss_clip": 1.16962314, + "balance_loss_mlp": 1.06809771, + "epoch": 0.24632496618067037, + "flos": 29462130131040.0, + "grad_norm": 1.6760140799602632, + "language_loss": 0.7763381, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.80474794, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 2.9065775871276855 + }, + { + "auxiliary_loss_clip": 0.01515921, + "auxiliary_loss_mlp": 0.01340586, + "balance_loss_clip": 1.16738462, + "balance_loss_mlp": 1.08442879, + "epoch": 0.24638508943333834, + "flos": 24318754768800.0, + "grad_norm": 2.0066919020294325, + "language_loss": 0.76222837, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.79079342, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 2.8831863403320312 + }, + { + "auxiliary_loss_clip": 0.01515811, + "auxiliary_loss_mlp": 0.01338578, + "balance_loss_clip": 1.16661048, + "balance_loss_mlp": 1.08280301, + "epoch": 0.2464452126860063, + "flos": 26615593306080.0, + "grad_norm": 7.3746809068741985, + "language_loss": 0.68141294, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70995682, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 2.9157345294952393 + }, + { + "auxiliary_loss_clip": 0.01518153, + "auxiliary_loss_mlp": 0.01325687, + "balance_loss_clip": 1.16983402, + "balance_loss_mlp": 1.06991136, + "epoch": 0.24650533593867427, + "flos": 31215111317280.0, + "grad_norm": 2.035536662902404, + "language_loss": 0.6620785, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.69051689, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 2.8761556148529053 + }, + { + "auxiliary_loss_clip": 0.01642554, + "auxiliary_loss_mlp": 0.01354912, + "balance_loss_clip": 1.30680394, + "balance_loss_mlp": 1.1420517, + "epoch": 0.24656545919134226, + "flos": 68500457550720.0, + "grad_norm": 0.7517822716280672, + "language_loss": 0.61426604, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.64424074, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.4778153896331787 + }, + { + "auxiliary_loss_clip": 0.01517857, + "auxiliary_loss_mlp": 0.0132814, + "balance_loss_clip": 1.17002726, + "balance_loss_mlp": 1.07255578, + "epoch": 0.24662558244401023, + "flos": 20086271205120.0, + "grad_norm": 1.599940044982763, + "language_loss": 0.73749709, + "learning_rate": 3.527601274535012e-06, + "loss": 0.765957, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.8560996055603027 + }, + { + "auxiliary_loss_clip": 0.01511513, + "auxiliary_loss_mlp": 0.01329634, + "balance_loss_clip": 1.16328025, + "balance_loss_mlp": 1.07481241, + "epoch": 0.2466857056966782, + "flos": 30704024260800.0, + "grad_norm": 2.3826405812185265, + "language_loss": 0.76128703, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78969854, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 2.8495399951934814 + }, + { + "auxiliary_loss_clip": 0.01512948, + "auxiliary_loss_mlp": 0.01327011, + "balance_loss_clip": 1.16557384, + "balance_loss_mlp": 1.0672307, + "epoch": 0.24674582894934616, + "flos": 22530727598400.0, + "grad_norm": 2.0311940525608647, + "language_loss": 0.78242129, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.81082088, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 2.823000192642212 + }, + { + "auxiliary_loss_clip": 0.01508114, + "auxiliary_loss_mlp": 0.0130953, + "balance_loss_clip": 1.15944874, + "balance_loss_mlp": 1.04860425, + "epoch": 0.24680595220201412, + "flos": 20706402814560.0, + "grad_norm": 1.8189555357356864, + "language_loss": 0.83666772, + "learning_rate": 3.526846877170133e-06, + "loss": 0.86484408, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 2.824953556060791 + }, + { + "auxiliary_loss_clip": 0.01515483, + "auxiliary_loss_mlp": 0.01316314, + "balance_loss_clip": 1.16781878, + "balance_loss_mlp": 1.05748653, + "epoch": 0.2468660754546821, + "flos": 21833108029440.0, + "grad_norm": 1.9984119219766716, + "language_loss": 0.76464903, + "learning_rate": 3.52659529557275e-06, + "loss": 0.79296696, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.8774356842041016 + }, + { + "auxiliary_loss_clip": 0.0150541, + "auxiliary_loss_mlp": 0.01306218, + "balance_loss_clip": 1.15730441, + "balance_loss_mlp": 1.04395795, + "epoch": 0.24692619870735008, + "flos": 15269498507520.0, + "grad_norm": 2.3235365097003293, + "language_loss": 0.72434753, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.75246382, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.8977622985839844 + }, + { + "auxiliary_loss_clip": 0.01518331, + "auxiliary_loss_mlp": 0.01309372, + "balance_loss_clip": 1.16977859, + "balance_loss_mlp": 1.04787397, + "epoch": 0.24698632196001805, + "flos": 29682722495520.0, + "grad_norm": 2.7456044468084913, + "language_loss": 0.65983486, + "learning_rate": 3.526091958721587e-06, + "loss": 0.6881119, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.8531620502471924 + }, + { + "auxiliary_loss_clip": 0.01509451, + "auxiliary_loss_mlp": 0.01303053, + "balance_loss_clip": 1.16077399, + "balance_loss_mlp": 1.0396477, + "epoch": 0.247046445212686, + "flos": 39168574178400.0, + "grad_norm": 2.117429603534141, + "language_loss": 0.73372352, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.76184857, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 3.019366502761841 + }, + { + "auxiliary_loss_clip": 0.0151161, + "auxiliary_loss_mlp": 0.01311943, + "balance_loss_clip": 1.16170645, + "balance_loss_mlp": 1.04834747, + "epoch": 0.24710656846535398, + "flos": 23000851877760.0, + "grad_norm": 1.7926970430495783, + "language_loss": 0.79372585, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.8219614, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.7537882328033447 + }, + { + "auxiliary_loss_clip": 0.01516791, + "auxiliary_loss_mlp": 0.01304988, + "balance_loss_clip": 1.16815615, + "balance_loss_mlp": 1.0377686, + "epoch": 0.24716669171802194, + "flos": 26435243083680.0, + "grad_norm": 2.2645491281322476, + "language_loss": 0.81190336, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.84012109, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 2.8328921794891357 + }, + { + "auxiliary_loss_clip": 0.01505755, + "auxiliary_loss_mlp": 0.01302254, + "balance_loss_clip": 1.15593362, + "balance_loss_mlp": 1.03999329, + "epoch": 0.2472268149706899, + "flos": 23332043849760.0, + "grad_norm": 2.213732612273002, + "language_loss": 0.75256616, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.7806462, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 2.881010055541992 + }, + { + "auxiliary_loss_clip": 0.01505031, + "auxiliary_loss_mlp": 0.01288229, + "balance_loss_clip": 1.1553278, + "balance_loss_mlp": 1.0196743, + "epoch": 0.24728693822335787, + "flos": 23770990817280.0, + "grad_norm": 2.2399188186355365, + "language_loss": 0.82928538, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.85721791, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.845261573791504 + }, + { + "auxiliary_loss_clip": 0.01503658, + "auxiliary_loss_mlp": 0.01298305, + "balance_loss_clip": 1.15463662, + "balance_loss_mlp": 1.03585398, + "epoch": 0.24734706147602586, + "flos": 19319507871840.0, + "grad_norm": 2.28461074708215, + "language_loss": 0.87328595, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.90130562, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.867969274520874 + }, + { + "auxiliary_loss_clip": 0.01510965, + "auxiliary_loss_mlp": 0.01301626, + "balance_loss_clip": 1.16115594, + "balance_loss_mlp": 1.03879321, + "epoch": 0.24740718472869383, + "flos": 28039392712800.0, + "grad_norm": 1.7435284116160983, + "language_loss": 0.75714087, + "learning_rate": 3.524328457352734e-06, + "loss": 0.78526676, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.8851656913757324 + }, + { + "auxiliary_loss_clip": 0.01628881, + "auxiliary_loss_mlp": 0.01252823, + "balance_loss_clip": 1.28730345, + "balance_loss_mlp": 1.03385925, + "epoch": 0.2474673079813618, + "flos": 68114837940480.0, + "grad_norm": 0.6691714335655539, + "language_loss": 0.58121383, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.61003089, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 3.4588568210601807 + }, + { + "auxiliary_loss_clip": 0.01506921, + "auxiliary_loss_mlp": 0.01303468, + "balance_loss_clip": 1.15723467, + "balance_loss_mlp": 1.03701103, + "epoch": 0.24752743123402976, + "flos": 29464785102240.0, + "grad_norm": 2.754607760965323, + "language_loss": 0.83748209, + "learning_rate": 3.523824079451235e-06, + "loss": 0.86558592, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.8814737796783447 + }, + { + "auxiliary_loss_clip": 0.01628592, + "auxiliary_loss_mlp": 0.01260468, + "balance_loss_clip": 1.2868799, + "balance_loss_mlp": 1.04455566, + "epoch": 0.24758755448669773, + "flos": 58356318165120.0, + "grad_norm": 0.8950339314411615, + "language_loss": 0.63474649, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.66363704, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 3.2243411540985107 + }, + { + "auxiliary_loss_clip": 0.01513498, + "auxiliary_loss_mlp": 0.01302081, + "balance_loss_clip": 1.16350436, + "balance_loss_mlp": 1.03791356, + "epoch": 0.2476476777393657, + "flos": 20486455228800.0, + "grad_norm": 1.573961362427433, + "language_loss": 0.79854846, + "learning_rate": 3.523319470415491e-06, + "loss": 0.82670426, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 2.895883083343506 + }, + { + "auxiliary_loss_clip": 0.01509149, + "auxiliary_loss_mlp": 0.01297857, + "balance_loss_clip": 1.15927958, + "balance_loss_mlp": 1.03616905, + "epoch": 0.24770780099203366, + "flos": 20487972355200.0, + "grad_norm": 1.7005004912652764, + "language_loss": 0.7508378, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.7789079, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 2.856675863265991 + }, + { + "auxiliary_loss_clip": 0.01496701, + "auxiliary_loss_mlp": 0.01299889, + "balance_loss_clip": 1.14784181, + "balance_loss_mlp": 1.03686523, + "epoch": 0.24776792424470165, + "flos": 15154916443200.0, + "grad_norm": 2.009474383323625, + "language_loss": 0.88039058, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90835643, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 2.9197089672088623 + }, + { + "auxiliary_loss_clip": 0.01511108, + "auxiliary_loss_mlp": 0.01323181, + "balance_loss_clip": 1.16106129, + "balance_loss_mlp": 1.06339979, + "epoch": 0.2478280474973696, + "flos": 21727590795360.0, + "grad_norm": 2.0072793023085596, + "language_loss": 0.69601333, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.72435629, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.8289756774902344 + }, + { + "auxiliary_loss_clip": 0.01497113, + "auxiliary_loss_mlp": 0.0131161, + "balance_loss_clip": 1.14747643, + "balance_loss_mlp": 1.05449986, + "epoch": 0.24788817075003758, + "flos": 20414277211680.0, + "grad_norm": 2.9253106082325693, + "language_loss": 0.80164897, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82973623, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.7997658252716064 + }, + { + "auxiliary_loss_clip": 0.01500246, + "auxiliary_loss_mlp": 0.01304445, + "balance_loss_clip": 1.15082538, + "balance_loss_mlp": 1.04714394, + "epoch": 0.24794829400270554, + "flos": 22596268187520.0, + "grad_norm": 3.166302037865468, + "language_loss": 0.75219113, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.78023809, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.848689556121826 + }, + { + "auxiliary_loss_clip": 0.01509944, + "auxiliary_loss_mlp": 0.01308564, + "balance_loss_clip": 1.16091561, + "balance_loss_mlp": 1.05126262, + "epoch": 0.2480084172553735, + "flos": 39679168168800.0, + "grad_norm": 1.6687093822056542, + "language_loss": 0.7371968, + "learning_rate": 3.521804257268357e-06, + "loss": 0.76538181, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 2.9222187995910645 + }, + { + "auxiliary_loss_clip": 0.01502586, + "auxiliary_loss_mlp": 0.01335691, + "balance_loss_clip": 1.15343726, + "balance_loss_mlp": 1.07114148, + "epoch": 0.24806854050804147, + "flos": 22055862299040.0, + "grad_norm": 1.8192807015371064, + "language_loss": 0.69849217, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.72687495, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.766923427581787 + }, + { + "auxiliary_loss_clip": 0.01504448, + "auxiliary_loss_mlp": 0.01331272, + "balance_loss_clip": 1.15464878, + "balance_loss_mlp": 1.07473338, + "epoch": 0.24812866376070947, + "flos": 15488573745600.0, + "grad_norm": 2.1876156803276023, + "language_loss": 0.80986094, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83821809, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 4.444140672683716 + }, + { + "auxiliary_loss_clip": 0.01505564, + "auxiliary_loss_mlp": 0.01316273, + "balance_loss_clip": 1.15568233, + "balance_loss_mlp": 1.05706453, + "epoch": 0.24818878701337743, + "flos": 14759207942400.0, + "grad_norm": 2.946207544902937, + "language_loss": 0.8467896, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.87500799, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 2.8328030109405518 + }, + { + "auxiliary_loss_clip": 0.01504688, + "auxiliary_loss_mlp": 0.01331819, + "balance_loss_clip": 1.15557265, + "balance_loss_mlp": 1.07470846, + "epoch": 0.2482489102660454, + "flos": 27091975731840.0, + "grad_norm": 2.2542033083185715, + "language_loss": 0.65529263, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.68365771, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.8956663608551025 + }, + { + "auxiliary_loss_clip": 0.01511686, + "auxiliary_loss_mlp": 0.01325606, + "balance_loss_clip": 1.1620959, + "balance_loss_mlp": 1.07002115, + "epoch": 0.24830903351871336, + "flos": 26469871858080.0, + "grad_norm": 1.7004659372688975, + "language_loss": 0.75843352, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.7868064, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.8902089595794678 + }, + { + "auxiliary_loss_clip": 0.01507962, + "auxiliary_loss_mlp": 0.01322004, + "balance_loss_clip": 1.15839446, + "balance_loss_mlp": 1.06355858, + "epoch": 0.24836915677138133, + "flos": 10229706043200.0, + "grad_norm": 5.309739774621314, + "language_loss": 0.76947904, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79777873, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.7555198669433594 + }, + { + "auxiliary_loss_clip": 0.01508406, + "auxiliary_loss_mlp": 0.01300461, + "balance_loss_clip": 1.15972519, + "balance_loss_mlp": 1.04163408, + "epoch": 0.2484292800240493, + "flos": 30083020303680.0, + "grad_norm": 1.6058389345955155, + "language_loss": 0.83805263, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86614132, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 2.884060859680176 + }, + { + "auxiliary_loss_clip": 0.01503501, + "auxiliary_loss_mlp": 0.01304273, + "balance_loss_clip": 1.1548537, + "balance_loss_mlp": 1.03857923, + "epoch": 0.24848940327671726, + "flos": 13444604801280.0, + "grad_norm": 2.0057007790558266, + "language_loss": 0.71478951, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.74286723, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 4.366900205612183 + }, + { + "auxiliary_loss_clip": 0.01510941, + "auxiliary_loss_mlp": 0.01322898, + "balance_loss_clip": 1.16075873, + "balance_loss_mlp": 1.04995656, + "epoch": 0.24854952652938525, + "flos": 19972068422400.0, + "grad_norm": 2.0990690313531317, + "language_loss": 0.61782193, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.6461603, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 6.003474950790405 + }, + { + "auxiliary_loss_clip": 0.01502433, + "auxiliary_loss_mlp": 0.01307401, + "balance_loss_clip": 1.15402591, + "balance_loss_mlp": 1.04208875, + "epoch": 0.24860964978205322, + "flos": 18152143305120.0, + "grad_norm": 2.06483845736511, + "language_loss": 0.78669608, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81479448, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 2.7537498474121094 + }, + { + "auxiliary_loss_clip": 0.0152032, + "auxiliary_loss_mlp": 0.01311407, + "balance_loss_clip": 1.17149925, + "balance_loss_mlp": 1.04704857, + "epoch": 0.24866977303472118, + "flos": 11730234846240.0, + "grad_norm": 2.0919401964966666, + "language_loss": 0.82696623, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.8552835, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 2.7650675773620605 + }, + { + "auxiliary_loss_clip": 0.01511357, + "auxiliary_loss_mlp": 0.01321221, + "balance_loss_clip": 1.16276455, + "balance_loss_mlp": 1.05781591, + "epoch": 0.24872989628738915, + "flos": 34826439211200.0, + "grad_norm": 1.965152097534657, + "language_loss": 0.71232176, + "learning_rate": 3.518767600693314e-06, + "loss": 0.74064755, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 2.873398780822754 + }, + { + "auxiliary_loss_clip": 0.01507747, + "auxiliary_loss_mlp": 0.01313831, + "balance_loss_clip": 1.16005111, + "balance_loss_mlp": 1.04565775, + "epoch": 0.2487900195400571, + "flos": 13701077569440.0, + "grad_norm": 1.8156603045278665, + "language_loss": 0.67073107, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69894689, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 2.7506964206695557 + }, + { + "auxiliary_loss_clip": 0.0150555, + "auxiliary_loss_mlp": 0.01296093, + "balance_loss_clip": 1.15919662, + "balance_loss_mlp": 1.03097117, + "epoch": 0.24885014279272508, + "flos": 25340170318560.0, + "grad_norm": 1.989385299217308, + "language_loss": 0.84144378, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86946023, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.826188087463379 + }, + { + "auxiliary_loss_clip": 0.01509583, + "auxiliary_loss_mlp": 0.01308858, + "balance_loss_clip": 1.16155553, + "balance_loss_mlp": 1.04163861, + "epoch": 0.24891026604539307, + "flos": 20633542090560.0, + "grad_norm": 1.7069947677406399, + "language_loss": 0.78742719, + "learning_rate": 3.518007140085481e-06, + "loss": 0.81561166, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.836120128631592 + }, + { + "auxiliary_loss_clip": 0.01639195, + "auxiliary_loss_mlp": 0.01345078, + "balance_loss_clip": 1.29828477, + "balance_loss_mlp": 1.13221741, + "epoch": 0.24897038929806103, + "flos": 66966589530720.0, + "grad_norm": 0.8293608101345079, + "language_loss": 0.60918397, + "learning_rate": 3.51775353807742e-06, + "loss": 0.6390267, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.406784772872925 + }, + { + "auxiliary_loss_clip": 0.01506703, + "auxiliary_loss_mlp": 0.01298813, + "balance_loss_clip": 1.15786088, + "balance_loss_mlp": 1.0300678, + "epoch": 0.249030512550729, + "flos": 36395504928000.0, + "grad_norm": 2.6001709510386366, + "language_loss": 0.73068178, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75873697, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 2.9057013988494873 + }, + { + "auxiliary_loss_clip": 0.01506352, + "auxiliary_loss_mlp": 0.01304392, + "balance_loss_clip": 1.15743637, + "balance_loss_mlp": 1.03755426, + "epoch": 0.24909063580339696, + "flos": 20156514886080.0, + "grad_norm": 1.8721505459445271, + "language_loss": 0.81093359, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83904099, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.838268756866455 + }, + { + "auxiliary_loss_clip": 0.01502157, + "auxiliary_loss_mlp": 0.01295628, + "balance_loss_clip": 1.15512705, + "balance_loss_mlp": 1.02783632, + "epoch": 0.24915075905606493, + "flos": 26399021326560.0, + "grad_norm": 1.9757170456516868, + "language_loss": 0.58736742, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61534524, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.7632923126220703 + }, + { + "auxiliary_loss_clip": 0.01505969, + "auxiliary_loss_mlp": 0.01302261, + "balance_loss_clip": 1.157166, + "balance_loss_mlp": 1.03885674, + "epoch": 0.2492108823087329, + "flos": 27529708998240.0, + "grad_norm": 2.256922572673824, + "language_loss": 0.7858426, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81392491, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 2.8436169624328613 + }, + { + "auxiliary_loss_clip": 0.01510606, + "auxiliary_loss_mlp": 0.01302629, + "balance_loss_clip": 1.16135955, + "balance_loss_mlp": 1.02987814, + "epoch": 0.24927100556140086, + "flos": 16693297914240.0, + "grad_norm": 2.1011544136028792, + "language_loss": 0.65497446, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.68310678, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.804415464401245 + }, + { + "auxiliary_loss_clip": 0.01634243, + "auxiliary_loss_mlp": 0.01260811, + "balance_loss_clip": 1.2949332, + "balance_loss_mlp": 1.04184723, + "epoch": 0.24933112881406885, + "flos": 62778975344640.0, + "grad_norm": 0.9864025188793025, + "language_loss": 0.67296493, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.7019155, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.4603428840637207 + }, + { + "auxiliary_loss_clip": 0.01501586, + "auxiliary_loss_mlp": 0.01298186, + "balance_loss_clip": 1.15400553, + "balance_loss_mlp": 1.02848661, + "epoch": 0.24939125206673682, + "flos": 26654356249920.0, + "grad_norm": 1.8193370451276734, + "language_loss": 0.89540827, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.923406, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.8793442249298096 + }, + { + "auxiliary_loss_clip": 0.01511208, + "auxiliary_loss_mlp": 0.01311164, + "balance_loss_clip": 1.16349173, + "balance_loss_mlp": 1.03631473, + "epoch": 0.24945137531940478, + "flos": 20706440742720.0, + "grad_norm": 2.041443779013634, + "language_loss": 0.68570149, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.71392518, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.7943320274353027 + }, + { + "auxiliary_loss_clip": 0.01506573, + "auxiliary_loss_mlp": 0.01311756, + "balance_loss_clip": 1.15916348, + "balance_loss_mlp": 1.04663515, + "epoch": 0.24951149857207275, + "flos": 23770914960960.0, + "grad_norm": 1.6290704260793596, + "language_loss": 0.71457773, + "learning_rate": 3.515468531258095e-06, + "loss": 0.74276102, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.8681209087371826 + }, + { + "auxiliary_loss_clip": 0.01499973, + "auxiliary_loss_mlp": 0.01299418, + "balance_loss_clip": 1.15150404, + "balance_loss_mlp": 1.03772974, + "epoch": 0.2495716218247407, + "flos": 15666458637600.0, + "grad_norm": 2.1147783350927827, + "language_loss": 0.73091865, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75891256, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 2.875943183898926 + }, + { + "auxiliary_loss_clip": 0.01504455, + "auxiliary_loss_mlp": 0.01310111, + "balance_loss_clip": 1.15758228, + "balance_loss_mlp": 1.0415566, + "epoch": 0.24963174507740868, + "flos": 24054317087040.0, + "grad_norm": 3.322231659479449, + "language_loss": 0.63372552, + "learning_rate": 3.514960119583781e-06, + "loss": 0.66187114, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 2.8018641471862793 + }, + { + "auxiliary_loss_clip": 0.0150052, + "auxiliary_loss_mlp": 0.01312492, + "balance_loss_clip": 1.15305591, + "balance_loss_mlp": 1.05080378, + "epoch": 0.24969186833007664, + "flos": 21801703148640.0, + "grad_norm": 1.9949443413912598, + "language_loss": 0.76930416, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79743433, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.832871675491333 + }, + { + "auxiliary_loss_clip": 0.01499089, + "auxiliary_loss_mlp": 0.01305406, + "balance_loss_clip": 1.15220213, + "balance_loss_mlp": 1.04562497, + "epoch": 0.24975199158274464, + "flos": 19940208403680.0, + "grad_norm": 2.1008839307168463, + "language_loss": 0.76798218, + "learning_rate": 3.514451478119711e-06, + "loss": 0.79602718, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.864668369293213 + }, + { + "auxiliary_loss_clip": 0.0150439, + "auxiliary_loss_mlp": 0.01320426, + "balance_loss_clip": 1.15638208, + "balance_loss_mlp": 1.05969167, + "epoch": 0.2498121148354126, + "flos": 25340777169120.0, + "grad_norm": 1.871449890985368, + "language_loss": 0.71240169, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.74064988, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.8592679500579834 + }, + { + "auxiliary_loss_clip": 0.01501386, + "auxiliary_loss_mlp": 0.01325141, + "balance_loss_clip": 1.15382957, + "balance_loss_mlp": 1.06593287, + "epoch": 0.24987223808808057, + "flos": 20560908935520.0, + "grad_norm": 1.7435989237044662, + "language_loss": 0.74759531, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77586055, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 2.7867953777313232 + }, + { + "auxiliary_loss_clip": 0.01498889, + "auxiliary_loss_mlp": 0.0130738, + "balance_loss_clip": 1.15248406, + "balance_loss_mlp": 1.04893422, + "epoch": 0.24993236134074853, + "flos": 19750717494720.0, + "grad_norm": 1.9978344339900722, + "language_loss": 0.77148533, + "learning_rate": 3.513688085236591e-06, + "loss": 0.79954803, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 2.8297455310821533 + }, + { + "auxiliary_loss_clip": 0.01497285, + "auxiliary_loss_mlp": 0.01319599, + "balance_loss_clip": 1.15044594, + "balance_loss_mlp": 1.0594368, + "epoch": 0.2499924845934165, + "flos": 18772274914560.0, + "grad_norm": 1.902711848720539, + "language_loss": 0.81674618, + "learning_rate": 3.513433506130942e-06, + "loss": 0.84491503, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 2.814520835876465 + }, + { + "auxiliary_loss_clip": 0.01495472, + "auxiliary_loss_mlp": 0.01309071, + "balance_loss_clip": 1.14861333, + "balance_loss_mlp": 1.04814529, + "epoch": 0.25005260784608446, + "flos": 16874027418240.0, + "grad_norm": 2.8724027451235945, + "language_loss": 0.75910199, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.7871474, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.845440626144409 + }, + { + "auxiliary_loss_clip": 0.01490913, + "auxiliary_loss_mlp": 0.01302265, + "balance_loss_clip": 1.14439726, + "balance_loss_mlp": 1.0390507, + "epoch": 0.2501127310987524, + "flos": 22126978327680.0, + "grad_norm": 2.0272281324431534, + "language_loss": 0.71808141, + "learning_rate": 3.512924175760649e-06, + "loss": 0.74601322, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 2.8764870166778564 + }, + { + "auxiliary_loss_clip": 0.01608117, + "auxiliary_loss_mlp": 0.0126696, + "balance_loss_clip": 1.27100027, + "balance_loss_mlp": 1.05257416, + "epoch": 0.2501728543514204, + "flos": 69465435269760.0, + "grad_norm": 0.7473238122859625, + "language_loss": 0.56747127, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.59622204, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 3.4416985511779785 + }, + { + "auxiliary_loss_clip": 0.01491908, + "auxiliary_loss_mlp": 0.01317895, + "balance_loss_clip": 1.14612007, + "balance_loss_mlp": 1.05162883, + "epoch": 0.25023297760408836, + "flos": 16291862261280.0, + "grad_norm": 1.716512974954532, + "language_loss": 0.81287313, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.84097117, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.816437005996704 + }, + { + "auxiliary_loss_clip": 0.01488696, + "auxiliary_loss_mlp": 0.01314407, + "balance_loss_clip": 1.14158881, + "balance_loss_mlp": 1.04947591, + "epoch": 0.2502931008567563, + "flos": 12239539279200.0, + "grad_norm": 5.342826876046854, + "language_loss": 0.87849069, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.9065218, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.777784585952759 + }, + { + "auxiliary_loss_clip": 0.01491793, + "auxiliary_loss_mlp": 0.0132256, + "balance_loss_clip": 1.14637184, + "balance_loss_mlp": 1.05896413, + "epoch": 0.25035322410942434, + "flos": 23183932927680.0, + "grad_norm": 1.7729694648712255, + "language_loss": 0.83453441, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.86267793, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 2.84281587600708 + }, + { + "auxiliary_loss_clip": 0.01498318, + "auxiliary_loss_mlp": 0.01317975, + "balance_loss_clip": 1.15274739, + "balance_loss_mlp": 1.06296229, + "epoch": 0.2504133473620923, + "flos": 20919333690720.0, + "grad_norm": 1.621160174071637, + "language_loss": 0.73845083, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76661372, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 4.562702655792236 + }, + { + "auxiliary_loss_clip": 0.01491615, + "auxiliary_loss_mlp": 0.01304994, + "balance_loss_clip": 1.14419663, + "balance_loss_mlp": 1.04521298, + "epoch": 0.2504734706147603, + "flos": 20778618759840.0, + "grad_norm": 1.9016674968395375, + "language_loss": 0.74557889, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.77354497, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 2.8087124824523926 + }, + { + "auxiliary_loss_clip": 0.01494938, + "auxiliary_loss_mlp": 0.01300077, + "balance_loss_clip": 1.14882684, + "balance_loss_mlp": 1.04144096, + "epoch": 0.25053359386742824, + "flos": 24351487135200.0, + "grad_norm": 1.9314566133166766, + "language_loss": 0.82001805, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84796822, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 2.790874719619751 + }, + { + "auxiliary_loss_clip": 0.01501627, + "auxiliary_loss_mlp": 0.01314151, + "balance_loss_clip": 1.15476525, + "balance_loss_mlp": 1.05494189, + "epoch": 0.2505937171200962, + "flos": 21216427882560.0, + "grad_norm": 2.2008111330303324, + "language_loss": 0.80075812, + "learning_rate": 3.51088456024312e-06, + "loss": 0.82891583, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.7950899600982666 + }, + { + "auxiliary_loss_clip": 0.01492579, + "auxiliary_loss_mlp": 0.01312597, + "balance_loss_clip": 1.14628911, + "balance_loss_mlp": 1.04728472, + "epoch": 0.25065384037276417, + "flos": 41430442587840.0, + "grad_norm": 2.1843862175678015, + "language_loss": 0.69811678, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72616851, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.979901075363159 + }, + { + "auxiliary_loss_clip": 0.01496858, + "auxiliary_loss_mlp": 0.0130456, + "balance_loss_clip": 1.15010583, + "balance_loss_mlp": 1.04516077, + "epoch": 0.25071396362543213, + "flos": 26104695890400.0, + "grad_norm": 2.381195432232395, + "language_loss": 0.78239727, + "learning_rate": 3.510374083241361e-06, + "loss": 0.81041145, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.840608835220337 + }, + { + "auxiliary_loss_clip": 0.01492938, + "auxiliary_loss_mlp": 0.01307045, + "balance_loss_clip": 1.14680946, + "balance_loss_mlp": 1.0463109, + "epoch": 0.2507740868781001, + "flos": 19101001556160.0, + "grad_norm": 2.5740563980596614, + "language_loss": 0.76778781, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.79578763, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 2.8082916736602783 + }, + { + "auxiliary_loss_clip": 0.01612166, + "auxiliary_loss_mlp": 0.01254005, + "balance_loss_clip": 1.27469194, + "balance_loss_mlp": 1.03885651, + "epoch": 0.25083421013076806, + "flos": 64348496199360.0, + "grad_norm": 0.8429914938680305, + "language_loss": 0.60017765, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62883931, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 4.823272943496704 + }, + { + "auxiliary_loss_clip": 0.01492825, + "auxiliary_loss_mlp": 0.01307914, + "balance_loss_clip": 1.14592409, + "balance_loss_mlp": 1.04851413, + "epoch": 0.25089433338343603, + "flos": 24281888232960.0, + "grad_norm": 1.6383685007163746, + "language_loss": 0.79043078, + "learning_rate": 3.509607938211409e-06, + "loss": 0.81843817, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 5.854681968688965 + }, + { + "auxiliary_loss_clip": 0.01493458, + "auxiliary_loss_mlp": 0.01302197, + "balance_loss_clip": 1.1483767, + "balance_loss_mlp": 1.03955543, + "epoch": 0.250954456636104, + "flos": 14723289610560.0, + "grad_norm": 2.1315619930003145, + "language_loss": 0.83849514, + "learning_rate": 3.509352442032875e-06, + "loss": 0.86645174, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 2.804206609725952 + }, + { + "auxiliary_loss_clip": 0.01493441, + "auxiliary_loss_mlp": 0.0130295, + "balance_loss_clip": 1.1460706, + "balance_loss_mlp": 1.03973579, + "epoch": 0.25101457988877196, + "flos": 22275809884800.0, + "grad_norm": 2.355791364055745, + "language_loss": 0.71309799, + "learning_rate": 3.509096888619545e-06, + "loss": 0.74106193, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 2.8261590003967285 + }, + { + "auxiliary_loss_clip": 0.01486694, + "auxiliary_loss_mlp": 0.01299767, + "balance_loss_clip": 1.13975632, + "balance_loss_mlp": 1.03769684, + "epoch": 0.2510747031414399, + "flos": 25191111192480.0, + "grad_norm": 1.970703134057151, + "language_loss": 0.81016153, + "learning_rate": 3.50884127798111e-06, + "loss": 0.83802617, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 2.8637232780456543 + }, + { + "auxiliary_loss_clip": 0.01501585, + "auxiliary_loss_mlp": 0.01300734, + "balance_loss_clip": 1.15396225, + "balance_loss_mlp": 1.0409534, + "epoch": 0.25113482639410795, + "flos": 20706289030080.0, + "grad_norm": 2.713661399904221, + "language_loss": 0.82700431, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.85502756, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.8147523403167725 + }, + { + "auxiliary_loss_clip": 0.01499757, + "auxiliary_loss_mlp": 0.0131053, + "balance_loss_clip": 1.15192342, + "balance_loss_mlp": 1.05151153, + "epoch": 0.2511949496467759, + "flos": 21509084479680.0, + "grad_norm": 2.455227847514581, + "language_loss": 0.82574755, + "learning_rate": 3.508329885067698e-06, + "loss": 0.85385048, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.8037352561950684 + }, + { + "auxiliary_loss_clip": 0.01493972, + "auxiliary_loss_mlp": 0.01305468, + "balance_loss_clip": 1.1480695, + "balance_loss_mlp": 1.04816699, + "epoch": 0.2512550728994439, + "flos": 20703975412320.0, + "grad_norm": 2.6924878676539095, + "language_loss": 0.76072407, + "learning_rate": 3.508074102812112e-06, + "loss": 0.78871852, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.7841804027557373 + }, + { + "auxiliary_loss_clip": 0.01499785, + "auxiliary_loss_mlp": 0.01328191, + "balance_loss_clip": 1.1522944, + "balance_loss_mlp": 1.06364179, + "epoch": 0.25131519615211184, + "flos": 18480604449600.0, + "grad_norm": 2.160317903900543, + "language_loss": 0.70125341, + "learning_rate": 3.507818263370206e-06, + "loss": 0.7295332, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.7743771076202393 + }, + { + "auxiliary_loss_clip": 0.01497274, + "auxiliary_loss_mlp": 0.01314368, + "balance_loss_clip": 1.15099692, + "balance_loss_mlp": 1.05649447, + "epoch": 0.2513753194047798, + "flos": 20487024151200.0, + "grad_norm": 1.7550818323469208, + "language_loss": 0.86094832, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88906479, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.887687921524048 + }, + { + "auxiliary_loss_clip": 0.01494276, + "auxiliary_loss_mlp": 0.01291253, + "balance_loss_clip": 1.1475426, + "balance_loss_mlp": 1.03128171, + "epoch": 0.25143544265744777, + "flos": 37673127748800.0, + "grad_norm": 1.967130273184886, + "language_loss": 0.68217731, + "learning_rate": 3.507306412966238e-06, + "loss": 0.71003258, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 2.917832851409912 + }, + { + "auxiliary_loss_clip": 0.01602241, + "auxiliary_loss_mlp": 0.01294212, + "balance_loss_clip": 1.26398373, + "balance_loss_mlp": 1.08440399, + "epoch": 0.25149556591011574, + "flos": 69373838034720.0, + "grad_norm": 0.8562592385993456, + "language_loss": 0.70046479, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72942936, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.363046407699585 + }, + { + "auxiliary_loss_clip": 0.01484988, + "auxiliary_loss_mlp": 0.01306889, + "balance_loss_clip": 1.13590968, + "balance_loss_mlp": 1.04634523, + "epoch": 0.2515556891627837, + "flos": 13992103255680.0, + "grad_norm": 2.1430261888336766, + "language_loss": 0.74420929, + "learning_rate": 3.506794333933431e-06, + "loss": 0.77212811, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.818233013153076 + }, + { + "auxiliary_loss_clip": 0.01491932, + "auxiliary_loss_mlp": 0.0130708, + "balance_loss_clip": 1.14397311, + "balance_loss_mlp": 1.04634547, + "epoch": 0.25161581241545167, + "flos": 22165703343360.0, + "grad_norm": 1.778299890966121, + "language_loss": 0.83142817, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85941833, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.8103904724121094 + }, + { + "auxiliary_loss_clip": 0.01600538, + "auxiliary_loss_mlp": 0.01261497, + "balance_loss_clip": 1.26175737, + "balance_loss_mlp": 1.04711151, + "epoch": 0.25167593566811963, + "flos": 69364280138400.0, + "grad_norm": 0.7886503958498551, + "language_loss": 0.61458623, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.6432066, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.254054307937622 + }, + { + "auxiliary_loss_clip": 0.01490827, + "auxiliary_loss_mlp": 0.01294214, + "balance_loss_clip": 1.14197969, + "balance_loss_mlp": 1.0289017, + "epoch": 0.2517360589207876, + "flos": 13263154662240.0, + "grad_norm": 3.46234972242175, + "language_loss": 0.79679388, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.82464427, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.79361629486084 + }, + { + "auxiliary_loss_clip": 0.01495553, + "auxiliary_loss_mlp": 0.01300209, + "balance_loss_clip": 1.14793539, + "balance_loss_mlp": 1.03947449, + "epoch": 0.25179618217345556, + "flos": 20378965730400.0, + "grad_norm": 1.5493951099216008, + "language_loss": 0.79969251, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82765019, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.7979886531829834 + }, + { + "auxiliary_loss_clip": 0.01492237, + "auxiliary_loss_mlp": 0.01291221, + "balance_loss_clip": 1.14378786, + "balance_loss_mlp": 1.02667201, + "epoch": 0.25185630542612353, + "flos": 27666744897600.0, + "grad_norm": 2.8986418131757183, + "language_loss": 0.74421632, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.77205086, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.915071964263916 + }, + { + "auxiliary_loss_clip": 0.01488643, + "auxiliary_loss_mlp": 0.01298027, + "balance_loss_clip": 1.14117885, + "balance_loss_mlp": 1.03748274, + "epoch": 0.25191642867879155, + "flos": 20998566345600.0, + "grad_norm": 3.2880709396236973, + "language_loss": 0.84834445, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.87621111, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.808091402053833 + }, + { + "auxiliary_loss_clip": 0.0149577, + "auxiliary_loss_mlp": 0.0130056, + "balance_loss_clip": 1.14783978, + "balance_loss_mlp": 1.03925323, + "epoch": 0.2519765519314595, + "flos": 21107800539360.0, + "grad_norm": 2.1339600035275494, + "language_loss": 0.75847477, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.78643805, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.8343186378479004 + }, + { + "auxiliary_loss_clip": 0.01602674, + "auxiliary_loss_mlp": 0.01263428, + "balance_loss_clip": 1.26327872, + "balance_loss_mlp": 1.04598999, + "epoch": 0.2520366751841275, + "flos": 62752008058560.0, + "grad_norm": 0.7170992370250633, + "language_loss": 0.57074857, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59940958, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 3.448667287826538 + }, + { + "auxiliary_loss_clip": 0.01494148, + "auxiliary_loss_mlp": 0.01300074, + "balance_loss_clip": 1.14522123, + "balance_loss_mlp": 1.03628778, + "epoch": 0.25209679843679544, + "flos": 22232571418080.0, + "grad_norm": 1.9905766183572726, + "language_loss": 0.76196158, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78990376, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 2.8275485038757324 + }, + { + "auxiliary_loss_clip": 0.01494446, + "auxiliary_loss_mlp": 0.01304916, + "balance_loss_clip": 1.14644361, + "balance_loss_mlp": 1.04494441, + "epoch": 0.2521569216894634, + "flos": 12168271537920.0, + "grad_norm": 2.2675898212149366, + "language_loss": 0.84707963, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.87507319, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.8262295722961426 + }, + { + "auxiliary_loss_clip": 0.01498783, + "auxiliary_loss_mlp": 0.01304332, + "balance_loss_clip": 1.15106487, + "balance_loss_mlp": 1.04779398, + "epoch": 0.2522170449421314, + "flos": 23702264262720.0, + "grad_norm": 1.4364676071754285, + "language_loss": 0.88633627, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.91436744, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.881060838699341 + }, + { + "auxiliary_loss_clip": 0.01487158, + "auxiliary_loss_mlp": 0.01305763, + "balance_loss_clip": 1.13975, + "balance_loss_mlp": 1.04769862, + "epoch": 0.25227716819479934, + "flos": 20957338071360.0, + "grad_norm": 3.282585224835715, + "language_loss": 0.85674316, + "learning_rate": 3.503717062883053e-06, + "loss": 0.88467234, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 2.815054178237915 + }, + { + "auxiliary_loss_clip": 0.01496261, + "auxiliary_loss_mlp": 0.0131422, + "balance_loss_clip": 1.14799356, + "balance_loss_mlp": 1.05520248, + "epoch": 0.2523372914474673, + "flos": 23333598904320.0, + "grad_norm": 1.8095881657593857, + "language_loss": 0.8351599, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.86326474, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 2.8824515342712402 + }, + { + "auxiliary_loss_clip": 0.01491514, + "auxiliary_loss_mlp": 0.01313765, + "balance_loss_clip": 1.1439805, + "balance_loss_mlp": 1.05226779, + "epoch": 0.25239741470013527, + "flos": 36972966993120.0, + "grad_norm": 2.8689155995407862, + "language_loss": 0.73021472, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75826752, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 2.9566028118133545 + }, + { + "auxiliary_loss_clip": 0.01489118, + "auxiliary_loss_mlp": 0.0130446, + "balance_loss_clip": 1.14077604, + "balance_loss_mlp": 1.04219973, + "epoch": 0.25245753795280323, + "flos": 18517774410720.0, + "grad_norm": 3.482820882287055, + "language_loss": 0.76913702, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79707283, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.8078949451446533 + }, + { + "auxiliary_loss_clip": 0.01489206, + "auxiliary_loss_mlp": 0.01317715, + "balance_loss_clip": 1.14204073, + "balance_loss_mlp": 1.05774379, + "epoch": 0.2525176612054712, + "flos": 32347733325120.0, + "grad_norm": 3.244239707754105, + "language_loss": 0.73314619, + "learning_rate": 3.502689480360739e-06, + "loss": 0.76121533, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 2.912355661392212 + }, + { + "auxiliary_loss_clip": 0.01490579, + "auxiliary_loss_mlp": 0.01309882, + "balance_loss_clip": 1.14337409, + "balance_loss_mlp": 1.05181766, + "epoch": 0.25257778445813917, + "flos": 45261642211200.0, + "grad_norm": 1.5806352862399806, + "language_loss": 0.82727313, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.85527772, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 3.0629308223724365 + }, + { + "auxiliary_loss_clip": 0.01490947, + "auxiliary_loss_mlp": 0.01315742, + "balance_loss_clip": 1.14335036, + "balance_loss_mlp": 1.05538905, + "epoch": 0.25263790771080713, + "flos": 23370048230400.0, + "grad_norm": 1.8553162183332004, + "language_loss": 0.74929553, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77736235, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.831092596054077 + }, + { + "auxiliary_loss_clip": 0.01494231, + "auxiliary_loss_mlp": 0.01303632, + "balance_loss_clip": 1.14650869, + "balance_loss_mlp": 1.04842925, + "epoch": 0.25269803096347515, + "flos": 18517015847520.0, + "grad_norm": 1.99027543252741, + "language_loss": 0.73796195, + "learning_rate": 3.501918195122491e-06, + "loss": 0.76594055, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 4.5218706130981445 + }, + { + "auxiliary_loss_clip": 0.01488106, + "auxiliary_loss_mlp": 0.01312183, + "balance_loss_clip": 1.14109802, + "balance_loss_mlp": 1.05583501, + "epoch": 0.2527581542161431, + "flos": 24613004348640.0, + "grad_norm": 1.5077794330424372, + "language_loss": 0.7763449, + "learning_rate": 3.501660986124297e-06, + "loss": 0.80434781, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 2.8805675506591797 + }, + { + "auxiliary_loss_clip": 0.01494954, + "auxiliary_loss_mlp": 0.01320928, + "balance_loss_clip": 1.14767528, + "balance_loss_mlp": 1.06400836, + "epoch": 0.2528182774688111, + "flos": 12642947196480.0, + "grad_norm": 2.0302712351513525, + "language_loss": 0.72230983, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.75046861, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 2.7792019844055176 + }, + { + "auxiliary_loss_clip": 0.01492731, + "auxiliary_loss_mlp": 0.01314313, + "balance_loss_clip": 1.14598024, + "balance_loss_mlp": 1.06216121, + "epoch": 0.25287840072147905, + "flos": 46940321403360.0, + "grad_norm": 1.474308594695026, + "language_loss": 0.75721544, + "learning_rate": 3.50114639730826e-06, + "loss": 0.78528589, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 3.0687179565429688 + }, + { + "auxiliary_loss_clip": 0.01494971, + "auxiliary_loss_mlp": 0.01300087, + "balance_loss_clip": 1.1470933, + "balance_loss_mlp": 1.03801703, + "epoch": 0.252938523974147, + "flos": 18881698749120.0, + "grad_norm": 1.7689393668497895, + "language_loss": 0.79172683, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81967741, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.8346993923187256 + }, + { + "auxiliary_loss_clip": 0.01495937, + "auxiliary_loss_mlp": 0.01306009, + "balance_loss_clip": 1.14838099, + "balance_loss_mlp": 1.04813504, + "epoch": 0.252998647226815, + "flos": 21436906462560.0, + "grad_norm": 1.6351748347526947, + "language_loss": 0.76586676, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.79388618, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.8471720218658447 + }, + { + "auxiliary_loss_clip": 0.0148894, + "auxiliary_loss_mlp": 0.01291325, + "balance_loss_clip": 1.14160156, + "balance_loss_mlp": 1.03020906, + "epoch": 0.25305877047948294, + "flos": 25444246282560.0, + "grad_norm": 2.376796560259193, + "language_loss": 0.70163512, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72943783, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 2.9414334297180176 + }, + { + "auxiliary_loss_clip": 0.01613761, + "auxiliary_loss_mlp": 0.01359955, + "balance_loss_clip": 1.27232456, + "balance_loss_mlp": 1.14862061, + "epoch": 0.2531188937321509, + "flos": 60192173109600.0, + "grad_norm": 0.7964536338515023, + "language_loss": 0.55075574, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.58049297, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 4.918566703796387 + }, + { + "auxiliary_loss_clip": 0.01492738, + "auxiliary_loss_mlp": 0.01296168, + "balance_loss_clip": 1.14583933, + "balance_loss_mlp": 1.03161907, + "epoch": 0.25317901698481887, + "flos": 19684418342400.0, + "grad_norm": 1.7527483281132228, + "language_loss": 0.80485749, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.83274662, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 4.5582966804504395 + }, + { + "auxiliary_loss_clip": 0.01494669, + "auxiliary_loss_mlp": 0.01281596, + "balance_loss_clip": 1.14707613, + "balance_loss_mlp": 1.0181911, + "epoch": 0.25323914023748684, + "flos": 24426585620640.0, + "grad_norm": 1.550467155696203, + "language_loss": 0.78595978, + "learning_rate": 3.499601265005622e-06, + "loss": 0.81372243, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 2.990980625152588 + }, + { + "auxiliary_loss_clip": 0.01487553, + "auxiliary_loss_mlp": 0.01285521, + "balance_loss_clip": 1.14040828, + "balance_loss_mlp": 1.02173424, + "epoch": 0.2532992634901548, + "flos": 25449821722080.0, + "grad_norm": 2.041617688023128, + "language_loss": 0.53956103, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.56729174, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 2.8936927318573 + }, + { + "auxiliary_loss_clip": 0.0149413, + "auxiliary_loss_mlp": 0.01307348, + "balance_loss_clip": 1.14664268, + "balance_loss_mlp": 1.03612256, + "epoch": 0.25335938674282277, + "flos": 18882571096800.0, + "grad_norm": 2.508259749113158, + "language_loss": 0.65054661, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67856139, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 2.821316957473755 + }, + { + "auxiliary_loss_clip": 0.01604562, + "auxiliary_loss_mlp": 0.01288383, + "balance_loss_clip": 1.2628355, + "balance_loss_mlp": 1.06407928, + "epoch": 0.25341950999549073, + "flos": 53068359199680.0, + "grad_norm": 0.8794243903668032, + "language_loss": 0.57952726, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60845673, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 3.224039077758789 + }, + { + "auxiliary_loss_clip": 0.0149322, + "auxiliary_loss_mlp": 0.01296772, + "balance_loss_clip": 1.14578581, + "balance_loss_mlp": 1.03451192, + "epoch": 0.2534796332481587, + "flos": 39023383724640.0, + "grad_norm": 1.6709149930633904, + "language_loss": 0.83108115, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85898101, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.924558401107788 + }, + { + "auxiliary_loss_clip": 0.01486576, + "auxiliary_loss_mlp": 0.01293598, + "balance_loss_clip": 1.13815427, + "balance_loss_mlp": 1.031147, + "epoch": 0.2535397565008267, + "flos": 23589275181120.0, + "grad_norm": 1.80401940113627, + "language_loss": 0.80488181, + "learning_rate": 3.498312090875666e-06, + "loss": 0.83268356, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.8582539558410645 + }, + { + "auxiliary_loss_clip": 0.01486552, + "auxiliary_loss_mlp": 0.01296006, + "balance_loss_clip": 1.13817501, + "balance_loss_mlp": 1.03717887, + "epoch": 0.2535998797534947, + "flos": 19283399899200.0, + "grad_norm": 2.329674034982802, + "language_loss": 0.75351411, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.7813397, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.8617475032806396 + }, + { + "auxiliary_loss_clip": 0.01487938, + "auxiliary_loss_mlp": 0.01302367, + "balance_loss_clip": 1.14020681, + "balance_loss_mlp": 1.03819966, + "epoch": 0.25366000300616265, + "flos": 24026667094080.0, + "grad_norm": 1.943873243831078, + "language_loss": 0.7509048, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.77880788, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.810880661010742 + }, + { + "auxiliary_loss_clip": 0.01490467, + "auxiliary_loss_mlp": 0.01304791, + "balance_loss_clip": 1.14258766, + "balance_loss_mlp": 1.04329348, + "epoch": 0.2537201262588306, + "flos": 16291217482560.0, + "grad_norm": 1.7256089381874522, + "language_loss": 0.81514895, + "learning_rate": 3.497537904525736e-06, + "loss": 0.84310156, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.8360767364501953 + }, + { + "auxiliary_loss_clip": 0.01492978, + "auxiliary_loss_mlp": 0.01301382, + "balance_loss_clip": 1.14607561, + "balance_loss_mlp": 1.03931201, + "epoch": 0.2537802495114986, + "flos": 23296922009280.0, + "grad_norm": 2.1427684053593916, + "language_loss": 0.70968688, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73763049, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 2.851248264312744 + }, + { + "auxiliary_loss_clip": 0.01489542, + "auxiliary_loss_mlp": 0.01298758, + "balance_loss_clip": 1.14227414, + "balance_loss_mlp": 1.04508066, + "epoch": 0.25384037276416654, + "flos": 17641056248640.0, + "grad_norm": 1.788067158408994, + "language_loss": 0.61891204, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64679503, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.849886655807495 + }, + { + "auxiliary_loss_clip": 0.01486439, + "auxiliary_loss_mlp": 0.01308697, + "balance_loss_clip": 1.13923144, + "balance_loss_mlp": 1.04491115, + "epoch": 0.2539004960168345, + "flos": 21509198264160.0, + "grad_norm": 1.6645444598088805, + "language_loss": 0.75175512, + "learning_rate": 3.496763207094731e-06, + "loss": 0.77970648, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.8467299938201904 + }, + { + "auxiliary_loss_clip": 0.01492012, + "auxiliary_loss_mlp": 0.01303276, + "balance_loss_clip": 1.14526784, + "balance_loss_mlp": 1.04502058, + "epoch": 0.2539606192695025, + "flos": 23953161591360.0, + "grad_norm": 6.476828479899427, + "language_loss": 0.8013801, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82933295, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.850140333175659 + }, + { + "auxiliary_loss_clip": 0.01485913, + "auxiliary_loss_mlp": 0.01323468, + "balance_loss_clip": 1.13959491, + "balance_loss_mlp": 1.06712008, + "epoch": 0.25402074252217044, + "flos": 24172047188640.0, + "grad_norm": 1.5387601298795388, + "language_loss": 0.77666199, + "learning_rate": 3.496246458337354e-06, + "loss": 0.80475581, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.9024786949157715 + }, + { + "auxiliary_loss_clip": 0.01489322, + "auxiliary_loss_mlp": 0.01311386, + "balance_loss_clip": 1.14326537, + "balance_loss_mlp": 1.05713618, + "epoch": 0.2540808657748384, + "flos": 22305621782880.0, + "grad_norm": 1.9117748353804864, + "language_loss": 0.84679943, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.87480652, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.841705083847046 + }, + { + "auxiliary_loss_clip": 0.01488344, + "auxiliary_loss_mlp": 0.01314301, + "balance_loss_clip": 1.14294434, + "balance_loss_mlp": 1.05700016, + "epoch": 0.25414098902750637, + "flos": 27602000799840.0, + "grad_norm": 1.7882860804525136, + "language_loss": 0.71185654, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73988295, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.868579387664795 + }, + { + "auxiliary_loss_clip": 0.01596828, + "auxiliary_loss_mlp": 0.01396408, + "balance_loss_clip": 1.25657773, + "balance_loss_mlp": 1.18736267, + "epoch": 0.25420111228017434, + "flos": 58176840290400.0, + "grad_norm": 1.0231091149969929, + "language_loss": 0.61846232, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.64839464, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 3.1715848445892334 + }, + { + "auxiliary_loss_clip": 0.01489243, + "auxiliary_loss_mlp": 0.01310418, + "balance_loss_clip": 1.14354277, + "balance_loss_mlp": 1.05139995, + "epoch": 0.2542612355328423, + "flos": 11465493739200.0, + "grad_norm": 2.991432023156751, + "language_loss": 0.86908805, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.89708471, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.8191473484039307 + }, + { + "auxiliary_loss_clip": 0.01496081, + "auxiliary_loss_mlp": 0.01307657, + "balance_loss_clip": 1.15077639, + "balance_loss_mlp": 1.04844809, + "epoch": 0.2543213587855103, + "flos": 22968081583200.0, + "grad_norm": 2.540368211083793, + "language_loss": 0.76664138, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79467869, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.850459098815918 + }, + { + "auxiliary_loss_clip": 0.01484561, + "auxiliary_loss_mlp": 0.01289198, + "balance_loss_clip": 1.13868117, + "balance_loss_mlp": 1.02979851, + "epoch": 0.2543814820381783, + "flos": 18254777999040.0, + "grad_norm": 2.2154391746356064, + "language_loss": 0.7545867, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.78232431, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.8567326068878174 + }, + { + "auxiliary_loss_clip": 0.01489147, + "auxiliary_loss_mlp": 0.01294456, + "balance_loss_clip": 1.1427443, + "balance_loss_mlp": 1.03524745, + "epoch": 0.25444160529084625, + "flos": 15634522762560.0, + "grad_norm": 2.710187054411745, + "language_loss": 0.73868144, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76651746, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 2.897242546081543 + }, + { + "auxiliary_loss_clip": 0.01495666, + "auxiliary_loss_mlp": 0.01291677, + "balance_loss_clip": 1.14957738, + "balance_loss_mlp": 1.03151441, + "epoch": 0.2545017285435142, + "flos": 24603560236800.0, + "grad_norm": 2.372850129681275, + "language_loss": 0.87059826, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.89847171, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.9427056312561035 + }, + { + "auxiliary_loss_clip": 0.01487263, + "auxiliary_loss_mlp": 0.0128395, + "balance_loss_clip": 1.14190435, + "balance_loss_mlp": 1.0266484, + "epoch": 0.2545618517961822, + "flos": 24681275765280.0, + "grad_norm": 1.8039760295582103, + "language_loss": 0.75786102, + "learning_rate": 3.493918281539737e-06, + "loss": 0.78557312, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 2.8641252517700195 + }, + { + "auxiliary_loss_clip": 0.01489637, + "auxiliary_loss_mlp": 0.01292062, + "balance_loss_clip": 1.14332128, + "balance_loss_mlp": 1.03094637, + "epoch": 0.25462197504885015, + "flos": 23917394972160.0, + "grad_norm": 1.7012910675456214, + "language_loss": 0.7483874, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77620435, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.8661701679229736 + }, + { + "auxiliary_loss_clip": 0.01490618, + "auxiliary_loss_mlp": 0.01299301, + "balance_loss_clip": 1.14614534, + "balance_loss_mlp": 1.02941096, + "epoch": 0.2546820983015181, + "flos": 24791647803840.0, + "grad_norm": 2.3559908548200936, + "language_loss": 0.65168941, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.67958856, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 2.8721988201141357 + }, + { + "auxiliary_loss_clip": 0.01494104, + "auxiliary_loss_mlp": 0.01289777, + "balance_loss_clip": 1.14798737, + "balance_loss_mlp": 1.03075945, + "epoch": 0.2547422215541861, + "flos": 18736129013760.0, + "grad_norm": 1.6068214596573045, + "language_loss": 0.6717149, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69955379, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 2.8696365356445312 + }, + { + "auxiliary_loss_clip": 0.0149138, + "auxiliary_loss_mlp": 0.01294868, + "balance_loss_clip": 1.14622617, + "balance_loss_mlp": 1.03585052, + "epoch": 0.25480234480685404, + "flos": 21034560533760.0, + "grad_norm": 2.4763451724225995, + "language_loss": 0.75869477, + "learning_rate": 3.492882062983333e-06, + "loss": 0.78655726, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.864960193634033 + }, + { + "auxiliary_loss_clip": 0.01486925, + "auxiliary_loss_mlp": 0.0129504, + "balance_loss_clip": 1.14180398, + "balance_loss_mlp": 1.03430521, + "epoch": 0.254862468059522, + "flos": 25084494041760.0, + "grad_norm": 1.755592086313286, + "language_loss": 0.80647266, + "learning_rate": 3.492622866794074e-06, + "loss": 0.83429235, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 2.9112179279327393 + }, + { + "auxiliary_loss_clip": 0.01490017, + "auxiliary_loss_mlp": 0.01286106, + "balance_loss_clip": 1.14387369, + "balance_loss_mlp": 1.0278511, + "epoch": 0.25492259131219, + "flos": 20560491725760.0, + "grad_norm": 1.7292666484666248, + "language_loss": 0.77338815, + "learning_rate": 3.492363614004407e-06, + "loss": 0.80114937, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.794107437133789 + }, + { + "auxiliary_loss_clip": 0.01493899, + "auxiliary_loss_mlp": 0.01303101, + "balance_loss_clip": 1.1469835, + "balance_loss_mlp": 1.04122233, + "epoch": 0.25498271456485794, + "flos": 25044631181280.0, + "grad_norm": 2.0467558693176366, + "language_loss": 0.83850348, + "learning_rate": 3.492104304624162e-06, + "loss": 0.86647356, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 4.569481134414673 + }, + { + "auxiliary_loss_clip": 0.01498092, + "auxiliary_loss_mlp": 0.01309858, + "balance_loss_clip": 1.152233, + "balance_loss_mlp": 1.05083966, + "epoch": 0.2550428378175259, + "flos": 26180894292480.0, + "grad_norm": 1.7331688195358659, + "language_loss": 0.73506057, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.76314008, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 2.9236984252929688 + }, + { + "auxiliary_loss_clip": 0.01493768, + "auxiliary_loss_mlp": 0.013042, + "balance_loss_clip": 1.14848852, + "balance_loss_mlp": 1.04384732, + "epoch": 0.2551029610701939, + "flos": 15268929585120.0, + "grad_norm": 3.2678780573807127, + "language_loss": 0.73188794, + "learning_rate": 3.491585516131273e-06, + "loss": 0.75986767, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 2.8637337684631348 + }, + { + "auxiliary_loss_clip": 0.01495239, + "auxiliary_loss_mlp": 0.01318665, + "balance_loss_clip": 1.14844751, + "balance_loss_mlp": 1.06365287, + "epoch": 0.2551630843228619, + "flos": 18114063068160.0, + "grad_norm": 1.6509496007526343, + "language_loss": 0.8231461, + "learning_rate": 3.491326037038301e-06, + "loss": 0.8512851, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.915227174758911 + }, + { + "auxiliary_loss_clip": 0.01601139, + "auxiliary_loss_mlp": 0.01284203, + "balance_loss_clip": 1.26098073, + "balance_loss_mlp": 1.05532074, + "epoch": 0.25522320757552985, + "flos": 70527851524800.0, + "grad_norm": 0.7516679716177656, + "language_loss": 0.57629758, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.605151, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 3.4396400451660156 + }, + { + "auxiliary_loss_clip": 0.01482518, + "auxiliary_loss_mlp": 0.01326553, + "balance_loss_clip": 1.13515067, + "balance_loss_mlp": 1.06696296, + "epoch": 0.2552833308281978, + "flos": 22895296715520.0, + "grad_norm": 2.7269603204977573, + "language_loss": 0.65899795, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.68708873, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 2.8093700408935547 + }, + { + "auxiliary_loss_clip": 0.01491388, + "auxiliary_loss_mlp": 0.0132289, + "balance_loss_clip": 1.14504886, + "balance_loss_mlp": 1.07417142, + "epoch": 0.2553434540808658, + "flos": 22055748514560.0, + "grad_norm": 1.904992193567806, + "language_loss": 0.81746304, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.84560585, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 2.887791395187378 + }, + { + "auxiliary_loss_clip": 0.01488828, + "auxiliary_loss_mlp": 0.01350666, + "balance_loss_clip": 1.14180541, + "balance_loss_mlp": 1.09241104, + "epoch": 0.25540357733353375, + "flos": 16546021411680.0, + "grad_norm": 3.469878836437656, + "language_loss": 0.83737445, + "learning_rate": 3.490287555252514e-06, + "loss": 0.86576939, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 4.388034343719482 + }, + { + "auxiliary_loss_clip": 0.01482964, + "auxiliary_loss_mlp": 0.01350403, + "balance_loss_clip": 1.13654852, + "balance_loss_mlp": 1.09634399, + "epoch": 0.2554637005862017, + "flos": 17566868039040.0, + "grad_norm": 2.036876359330588, + "language_loss": 0.8421216, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.87045527, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 5.8963940143585205 + }, + { + "auxiliary_loss_clip": 0.01613174, + "auxiliary_loss_mlp": 0.01295937, + "balance_loss_clip": 1.27358878, + "balance_loss_mlp": 1.0777359, + "epoch": 0.2555238238388697, + "flos": 72251021177280.0, + "grad_norm": 0.7847090485582117, + "language_loss": 0.56231451, + "learning_rate": 3.489767975249115e-06, + "loss": 0.59140563, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 3.3571743965148926 + }, + { + "auxiliary_loss_clip": 0.01485347, + "auxiliary_loss_mlp": 0.01365572, + "balance_loss_clip": 1.14002681, + "balance_loss_mlp": 1.11170423, + "epoch": 0.25558394709153764, + "flos": 24391729277280.0, + "grad_norm": 2.221516952300813, + "language_loss": 0.80780292, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.83631212, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 2.8666481971740723 + }, + { + "auxiliary_loss_clip": 0.01611535, + "auxiliary_loss_mlp": 0.01274002, + "balance_loss_clip": 1.27198446, + "balance_loss_mlp": 1.05427551, + "epoch": 0.2556440703442056, + "flos": 69238129620960.0, + "grad_norm": 0.7982826761542883, + "language_loss": 0.66128016, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.6901356, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 3.3456530570983887 + }, + { + "auxiliary_loss_clip": 0.01490574, + "auxiliary_loss_mlp": 0.01359076, + "balance_loss_clip": 1.1447891, + "balance_loss_mlp": 1.11035788, + "epoch": 0.2557041935968736, + "flos": 24866253223200.0, + "grad_norm": 1.8697864427110384, + "language_loss": 0.7387495, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.76724601, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.965837001800537 + }, + { + "auxiliary_loss_clip": 0.01484442, + "auxiliary_loss_mlp": 0.01315248, + "balance_loss_clip": 1.13862765, + "balance_loss_mlp": 1.06099844, + "epoch": 0.25576431684954154, + "flos": 22494316200480.0, + "grad_norm": 2.1198067718818865, + "language_loss": 0.7324447, + "learning_rate": 3.488728137415357e-06, + "loss": 0.7604416, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.8412249088287354 + }, + { + "auxiliary_loss_clip": 0.01494475, + "auxiliary_loss_mlp": 0.01315498, + "balance_loss_clip": 1.1484009, + "balance_loss_mlp": 1.06124842, + "epoch": 0.2558244401022095, + "flos": 19828698520320.0, + "grad_norm": 2.431970798037619, + "language_loss": 0.81217277, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.84027255, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.867767333984375 + }, + { + "auxiliary_loss_clip": 0.01492731, + "auxiliary_loss_mlp": 0.01296725, + "balance_loss_clip": 1.14726102, + "balance_loss_mlp": 1.03866088, + "epoch": 0.2558845633548775, + "flos": 23222657943360.0, + "grad_norm": 1.5882779457294591, + "language_loss": 0.85868537, + "learning_rate": 3.488207879742721e-06, + "loss": 0.88657993, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.9719789028167725 + }, + { + "auxiliary_loss_clip": 0.01488571, + "auxiliary_loss_mlp": 0.01306633, + "balance_loss_clip": 1.14281154, + "balance_loss_mlp": 1.04360962, + "epoch": 0.2559446866075455, + "flos": 16839815853600.0, + "grad_norm": 1.7065946780585277, + "language_loss": 0.7516073, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77955937, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.8705904483795166 + }, + { + "auxiliary_loss_clip": 0.01611178, + "auxiliary_loss_mlp": 0.01265602, + "balance_loss_clip": 1.27184629, + "balance_loss_mlp": 1.0351944, + "epoch": 0.25600480986021346, + "flos": 57600136788480.0, + "grad_norm": 0.8172876843048368, + "language_loss": 0.65233099, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.68109882, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.3907310962677 + }, + { + "auxiliary_loss_clip": 0.01495224, + "auxiliary_loss_mlp": 0.01288738, + "balance_loss_clip": 1.14829433, + "balance_loss_mlp": 1.03448832, + "epoch": 0.2560649331128814, + "flos": 27822062170080.0, + "grad_norm": 1.6502243008127615, + "language_loss": 0.76977718, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.79761678, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.9360790252685547 + }, + { + "auxiliary_loss_clip": 0.01610977, + "auxiliary_loss_mlp": 0.01245583, + "balance_loss_clip": 1.27112246, + "balance_loss_mlp": 1.02204132, + "epoch": 0.2561250563655494, + "flos": 70957771590240.0, + "grad_norm": 0.8028267426514096, + "language_loss": 0.58522969, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.61379528, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.4716994762420654 + }, + { + "auxiliary_loss_clip": 0.01488561, + "auxiliary_loss_mlp": 0.01287305, + "balance_loss_clip": 1.14196181, + "balance_loss_mlp": 1.03095782, + "epoch": 0.25618517961821735, + "flos": 27014563628640.0, + "grad_norm": 1.9865669065411957, + "language_loss": 0.77092862, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79868734, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 2.8815438747406006 + }, + { + "auxiliary_loss_clip": 0.01486139, + "auxiliary_loss_mlp": 0.01292244, + "balance_loss_clip": 1.13972938, + "balance_loss_mlp": 1.03494263, + "epoch": 0.2562453028708853, + "flos": 23070109426560.0, + "grad_norm": 1.652862437631867, + "language_loss": 0.83599973, + "learning_rate": 3.486645752648842e-06, + "loss": 0.86378354, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.967442750930786 + }, + { + "auxiliary_loss_clip": 0.01480544, + "auxiliary_loss_mlp": 0.01299042, + "balance_loss_clip": 1.13409328, + "balance_loss_mlp": 1.03735352, + "epoch": 0.2563054261235533, + "flos": 15122677142880.0, + "grad_norm": 2.5383590054456904, + "language_loss": 0.74034894, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76814485, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.8825266361236572 + }, + { + "auxiliary_loss_clip": 0.01490567, + "auxiliary_loss_mlp": 0.01308302, + "balance_loss_clip": 1.14356327, + "balance_loss_mlp": 1.05615044, + "epoch": 0.25636554937622125, + "flos": 27857335723200.0, + "grad_norm": 2.162220342620178, + "language_loss": 0.83123803, + "learning_rate": 3.486124592522163e-06, + "loss": 0.85922658, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.991886615753174 + }, + { + "auxiliary_loss_clip": 0.01493142, + "auxiliary_loss_mlp": 0.01302261, + "balance_loss_clip": 1.1458137, + "balance_loss_mlp": 1.04267049, + "epoch": 0.2564256726288892, + "flos": 28908676955520.0, + "grad_norm": 1.6406528066542236, + "language_loss": 0.74658835, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.77454245, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.9735679626464844 + }, + { + "auxiliary_loss_clip": 0.01483211, + "auxiliary_loss_mlp": 0.01302571, + "balance_loss_clip": 1.13636196, + "balance_loss_mlp": 1.04526949, + "epoch": 0.2564857958815572, + "flos": 18516902063040.0, + "grad_norm": 1.7605322540249082, + "language_loss": 0.82036334, + "learning_rate": 3.485603206979513e-06, + "loss": 0.84822112, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.9100167751312256 + }, + { + "auxiliary_loss_clip": 0.01484341, + "auxiliary_loss_mlp": 0.01297961, + "balance_loss_clip": 1.13705373, + "balance_loss_mlp": 1.04638183, + "epoch": 0.25654591913422514, + "flos": 25810446310560.0, + "grad_norm": 1.5637572275184701, + "language_loss": 0.79441226, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.82223523, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.910780191421509 + }, + { + "auxiliary_loss_clip": 0.01492897, + "auxiliary_loss_mlp": 0.01311036, + "balance_loss_clip": 1.14559019, + "balance_loss_mlp": 1.05926585, + "epoch": 0.2566060423868931, + "flos": 19101722191200.0, + "grad_norm": 1.6651211790564133, + "language_loss": 0.7919594, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81999874, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.916753053665161 + }, + { + "auxiliary_loss_clip": 0.01475854, + "auxiliary_loss_mlp": 0.01293058, + "balance_loss_clip": 1.12792981, + "balance_loss_mlp": 1.03785491, + "epoch": 0.25666616563956113, + "flos": 23844989386080.0, + "grad_norm": 1.5375007946965689, + "language_loss": 0.68418944, + "learning_rate": 3.484820706183595e-06, + "loss": 0.71187866, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.887542724609375 + }, + { + "auxiliary_loss_clip": 0.01481401, + "auxiliary_loss_mlp": 0.01324939, + "balance_loss_clip": 1.13466489, + "balance_loss_mlp": 1.06973612, + "epoch": 0.2567262888922291, + "flos": 14605635365280.0, + "grad_norm": 3.644450464636078, + "language_loss": 0.79135668, + "learning_rate": 3.484559759962666e-06, + "loss": 0.81942004, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.8445284366607666 + }, + { + "auxiliary_loss_clip": 0.01484217, + "auxiliary_loss_mlp": 0.01310417, + "balance_loss_clip": 1.13684511, + "balance_loss_mlp": 1.05330622, + "epoch": 0.25678641214489706, + "flos": 32925688456320.0, + "grad_norm": 2.0024172794155533, + "language_loss": 0.68467438, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.71262074, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.9342150688171387 + }, + { + "auxiliary_loss_clip": 0.01482607, + "auxiliary_loss_mlp": 0.01304757, + "balance_loss_clip": 1.13519597, + "balance_loss_mlp": 1.04936266, + "epoch": 0.256846535397565, + "flos": 24101310441600.0, + "grad_norm": 1.4829802020731346, + "language_loss": 0.87418467, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.90205836, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.9024033546447754 + }, + { + "auxiliary_loss_clip": 0.01489499, + "auxiliary_loss_mlp": 0.0130753, + "balance_loss_clip": 1.14100385, + "balance_loss_mlp": 1.05232704, + "epoch": 0.256906658650233, + "flos": 19720450458720.0, + "grad_norm": 1.7180780294564812, + "language_loss": 0.82190561, + "learning_rate": 3.483776583571541e-06, + "loss": 0.84987593, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 2.8682823181152344 + }, + { + "auxiliary_loss_clip": 0.01483838, + "auxiliary_loss_mlp": 0.0130734, + "balance_loss_clip": 1.13577676, + "balance_loss_mlp": 1.05366313, + "epoch": 0.25696678190290095, + "flos": 22928029081920.0, + "grad_norm": 1.7247182553666422, + "language_loss": 0.77439523, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.80230701, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.9490087032318115 + }, + { + "auxiliary_loss_clip": 0.01478005, + "auxiliary_loss_mlp": 0.01300253, + "balance_loss_clip": 1.12970376, + "balance_loss_mlp": 1.04695702, + "epoch": 0.2570269051555689, + "flos": 27310292406720.0, + "grad_norm": 1.666816090894508, + "language_loss": 0.84099972, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86878234, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 2.83530855178833 + }, + { + "auxiliary_loss_clip": 0.01480997, + "auxiliary_loss_mlp": 0.01292287, + "balance_loss_clip": 1.13414419, + "balance_loss_mlp": 1.03212476, + "epoch": 0.2570870284082369, + "flos": 27565892827200.0, + "grad_norm": 1.9936805302336125, + "language_loss": 0.78412902, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.81186181, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 2.8787124156951904 + }, + { + "auxiliary_loss_clip": 0.01482543, + "auxiliary_loss_mlp": 0.01303215, + "balance_loss_clip": 1.13626921, + "balance_loss_mlp": 1.04648614, + "epoch": 0.25714715166090485, + "flos": 28733371178400.0, + "grad_norm": 1.664690451225991, + "language_loss": 0.79790497, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.82576251, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.904820203781128 + }, + { + "auxiliary_loss_clip": 0.01482814, + "auxiliary_loss_mlp": 0.0130307, + "balance_loss_clip": 1.13489401, + "balance_loss_mlp": 1.04901087, + "epoch": 0.2572072749135728, + "flos": 20117979511200.0, + "grad_norm": 2.0151107468256315, + "language_loss": 0.78922367, + "learning_rate": 3.482470164419295e-06, + "loss": 0.81708252, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 2.7743844985961914 + }, + { + "auxiliary_loss_clip": 0.01486494, + "auxiliary_loss_mlp": 0.01304395, + "balance_loss_clip": 1.13861179, + "balance_loss_mlp": 1.0449959, + "epoch": 0.2572673981662408, + "flos": 26033238508320.0, + "grad_norm": 2.129891654555946, + "language_loss": 0.74908954, + "learning_rate": 3.482208711902952e-06, + "loss": 0.77699846, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 4.451391220092773 + }, + { + "auxiliary_loss_clip": 0.0147774, + "auxiliary_loss_mlp": 0.0130586, + "balance_loss_clip": 1.13006425, + "balance_loss_mlp": 1.04951251, + "epoch": 0.25732752141890874, + "flos": 16108439857920.0, + "grad_norm": 2.0880755367012758, + "language_loss": 0.86247748, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.89031351, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.85463809967041 + }, + { + "auxiliary_loss_clip": 0.01484516, + "auxiliary_loss_mlp": 0.01294221, + "balance_loss_clip": 1.13648558, + "balance_loss_mlp": 1.03768265, + "epoch": 0.2573876446715767, + "flos": 22526403788160.0, + "grad_norm": 3.2915796870471006, + "language_loss": 0.79122508, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.81901246, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 2.7480688095092773 + }, + { + "auxiliary_loss_clip": 0.01478689, + "auxiliary_loss_mlp": 0.01296724, + "balance_loss_clip": 1.13106036, + "balance_loss_mlp": 1.04342806, + "epoch": 0.2574477679242447, + "flos": 23953085735040.0, + "grad_norm": 2.2895221745718177, + "language_loss": 0.87332243, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.90107656, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 2.8233797550201416 + }, + { + "auxiliary_loss_clip": 0.01477814, + "auxiliary_loss_mlp": 0.01307166, + "balance_loss_clip": 1.12890816, + "balance_loss_mlp": 1.05444181, + "epoch": 0.2575078911769127, + "flos": 21983911850880.0, + "grad_norm": 1.481043206547219, + "language_loss": 0.70364374, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.73149353, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 2.7715814113616943 + }, + { + "auxiliary_loss_clip": 0.01481626, + "auxiliary_loss_mlp": 0.01295672, + "balance_loss_clip": 1.13365138, + "balance_loss_mlp": 1.04428291, + "epoch": 0.25756801442958066, + "flos": 21947614237440.0, + "grad_norm": 1.8231959815845762, + "language_loss": 0.80757523, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.83534813, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 2.8529863357543945 + }, + { + "auxiliary_loss_clip": 0.01480204, + "auxiliary_loss_mlp": 0.01292139, + "balance_loss_clip": 1.13288593, + "balance_loss_mlp": 1.03674543, + "epoch": 0.2576281376822486, + "flos": 35264893112640.0, + "grad_norm": 2.1475125029363618, + "language_loss": 0.70069289, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72841638, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 2.894902229309082 + }, + { + "auxiliary_loss_clip": 0.0148051, + "auxiliary_loss_mlp": 0.01294896, + "balance_loss_clip": 1.13224995, + "balance_loss_mlp": 1.03740346, + "epoch": 0.2576882609349166, + "flos": 14133766390560.0, + "grad_norm": 2.0370717514487375, + "language_loss": 0.58746898, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.61522305, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 4.244974613189697 + }, + { + "auxiliary_loss_clip": 0.01479862, + "auxiliary_loss_mlp": 0.01296087, + "balance_loss_clip": 1.13226581, + "balance_loss_mlp": 1.0342083, + "epoch": 0.25774838418758456, + "flos": 23260586467680.0, + "grad_norm": 2.0916544046140833, + "language_loss": 0.64215094, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66991043, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 6.570277690887451 + }, + { + "auxiliary_loss_clip": 0.01476296, + "auxiliary_loss_mlp": 0.01293313, + "balance_loss_clip": 1.1280663, + "balance_loss_mlp": 1.03753734, + "epoch": 0.2578085074402525, + "flos": 22603967604000.0, + "grad_norm": 3.1672995402217112, + "language_loss": 0.71820843, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74590456, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 2.7762935161590576 + }, + { + "auxiliary_loss_clip": 0.01475007, + "auxiliary_loss_mlp": 0.01293251, + "balance_loss_clip": 1.12669468, + "balance_loss_mlp": 1.03823888, + "epoch": 0.2578686306929205, + "flos": 24574165548480.0, + "grad_norm": 1.6212822849139892, + "language_loss": 0.77130401, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79898655, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.8830490112304688 + }, + { + "auxiliary_loss_clip": 0.01484929, + "auxiliary_loss_mlp": 0.01293416, + "balance_loss_clip": 1.13617003, + "balance_loss_mlp": 1.03687739, + "epoch": 0.25792875394558845, + "flos": 18115921548000.0, + "grad_norm": 3.012350394412907, + "language_loss": 0.85515213, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.88293552, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 2.7458009719848633 + }, + { + "auxiliary_loss_clip": 0.01484435, + "auxiliary_loss_mlp": 0.0130387, + "balance_loss_clip": 1.13559663, + "balance_loss_mlp": 1.04313588, + "epoch": 0.2579888771982564, + "flos": 17714865176640.0, + "grad_norm": 2.310245291767928, + "language_loss": 0.72801989, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.75590295, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 2.796962022781372 + }, + { + "auxiliary_loss_clip": 0.01479655, + "auxiliary_loss_mlp": 0.01289526, + "balance_loss_clip": 1.13191724, + "balance_loss_mlp": 1.02802896, + "epoch": 0.2580490004509244, + "flos": 16436711361600.0, + "grad_norm": 4.476358850040138, + "language_loss": 0.8145535, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.84224534, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.7571046352386475 + }, + { + "auxiliary_loss_clip": 0.01484316, + "auxiliary_loss_mlp": 0.01306557, + "balance_loss_clip": 1.13510668, + "balance_loss_mlp": 1.05001903, + "epoch": 0.25810912370359235, + "flos": 33837756027840.0, + "grad_norm": 2.4549992970040826, + "language_loss": 0.67679018, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.70469892, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 2.8908438682556152 + }, + { + "auxiliary_loss_clip": 0.01484977, + "auxiliary_loss_mlp": 0.01297993, + "balance_loss_clip": 1.1352545, + "balance_loss_mlp": 1.04298019, + "epoch": 0.2581692469562603, + "flos": 25194562655040.0, + "grad_norm": 2.0252567647187347, + "language_loss": 0.75679672, + "learning_rate": 3.478280185054542e-06, + "loss": 0.78462642, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 2.911350965499878 + }, + { + "auxiliary_loss_clip": 0.01484388, + "auxiliary_loss_mlp": 0.01300346, + "balance_loss_clip": 1.13400984, + "balance_loss_mlp": 1.04457021, + "epoch": 0.2582293702089283, + "flos": 34935028626240.0, + "grad_norm": 4.942959787086984, + "language_loss": 0.80642086, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83426821, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.9242262840270996 + }, + { + "auxiliary_loss_clip": 0.01482824, + "auxiliary_loss_mlp": 0.01299525, + "balance_loss_clip": 1.1314621, + "balance_loss_mlp": 1.03821826, + "epoch": 0.2582894934615963, + "flos": 26836185670560.0, + "grad_norm": 1.8910181551518996, + "language_loss": 0.72985744, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.75768101, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 2.888025999069214 + }, + { + "auxiliary_loss_clip": 0.014819, + "auxiliary_loss_mlp": 0.01302649, + "balance_loss_clip": 1.1323061, + "balance_loss_mlp": 1.04096103, + "epoch": 0.25834961671426426, + "flos": 23517590230080.0, + "grad_norm": 1.7881334891746197, + "language_loss": 0.87471902, + "learning_rate": 3.477492965085067e-06, + "loss": 0.90256453, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 2.8550796508789062 + }, + { + "auxiliary_loss_clip": 0.01478907, + "auxiliary_loss_mlp": 0.01303537, + "balance_loss_clip": 1.12973547, + "balance_loss_mlp": 1.04356539, + "epoch": 0.25840973996693223, + "flos": 22452974141760.0, + "grad_norm": 1.756163702334371, + "language_loss": 0.84538972, + "learning_rate": 3.477230446361943e-06, + "loss": 0.87321413, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.836473226547241 + }, + { + "auxiliary_loss_clip": 0.01477505, + "auxiliary_loss_mlp": 0.01296888, + "balance_loss_clip": 1.12765884, + "balance_loss_mlp": 1.03405499, + "epoch": 0.2584698632196002, + "flos": 11292425723520.0, + "grad_norm": 2.635910312661116, + "language_loss": 0.83620745, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.86395144, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.862421751022339 + }, + { + "auxiliary_loss_clip": 0.01479786, + "auxiliary_loss_mlp": 0.01291162, + "balance_loss_clip": 1.13030505, + "balance_loss_mlp": 1.03786623, + "epoch": 0.25852998647226816, + "flos": 17931854365920.0, + "grad_norm": 2.1151340063178505, + "language_loss": 0.82841635, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.85612583, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 2.8287410736083984 + }, + { + "auxiliary_loss_clip": 0.01479717, + "auxiliary_loss_mlp": 0.01297977, + "balance_loss_clip": 1.12919712, + "balance_loss_mlp": 1.04067612, + "epoch": 0.2585901097249361, + "flos": 33258966477120.0, + "grad_norm": 1.9299405808430021, + "language_loss": 0.67585498, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.70363194, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.980236530303955 + }, + { + "auxiliary_loss_clip": 0.01481094, + "auxiliary_loss_mlp": 0.01324435, + "balance_loss_clip": 1.13071573, + "balance_loss_mlp": 1.06999481, + "epoch": 0.2586502329776041, + "flos": 18443206919520.0, + "grad_norm": 77.8320168237356, + "language_loss": 0.81776178, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.84581709, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.8209261894226074 + }, + { + "auxiliary_loss_clip": 0.01481075, + "auxiliary_loss_mlp": 0.01300205, + "balance_loss_clip": 1.13139307, + "balance_loss_mlp": 1.04614639, + "epoch": 0.25871035623027205, + "flos": 17970086315520.0, + "grad_norm": 1.7413497872804917, + "language_loss": 0.92131853, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94913137, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.8024685382843018 + }, + { + "auxiliary_loss_clip": 0.01479641, + "auxiliary_loss_mlp": 0.01296286, + "balance_loss_clip": 1.12923265, + "balance_loss_mlp": 1.04089165, + "epoch": 0.25877047948294, + "flos": 27779961548160.0, + "grad_norm": 2.6500812248522494, + "language_loss": 0.67669028, + "learning_rate": 3.475654158020507e-06, + "loss": 0.70444953, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.8408493995666504 + }, + { + "auxiliary_loss_clip": 0.01478152, + "auxiliary_loss_mlp": 0.01297564, + "balance_loss_clip": 1.12691712, + "balance_loss_mlp": 1.04159737, + "epoch": 0.258830602735608, + "flos": 27128311273440.0, + "grad_norm": 2.5388925460109757, + "language_loss": 0.72452319, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75228029, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.9184815883636475 + }, + { + "auxiliary_loss_clip": 0.0147644, + "auxiliary_loss_mlp": 0.01318063, + "balance_loss_clip": 1.12612367, + "balance_loss_mlp": 1.05732882, + "epoch": 0.25889072598827595, + "flos": 17893243134720.0, + "grad_norm": 2.1212261046611887, + "language_loss": 0.75969911, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.78764415, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.7893710136413574 + }, + { + "auxiliary_loss_clip": 0.01599171, + "auxiliary_loss_mlp": 0.01518715, + "balance_loss_clip": 1.25322783, + "balance_loss_mlp": 1.31195831, + "epoch": 0.2589508492409439, + "flos": 53940525982560.0, + "grad_norm": 0.9235829588957556, + "language_loss": 0.57035714, + "learning_rate": 3.474865258296403e-06, + "loss": 0.60153604, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.173855781555176 + }, + { + "auxiliary_loss_clip": 0.01493206, + "auxiliary_loss_mlp": 0.01295167, + "balance_loss_clip": 1.14243245, + "balance_loss_mlp": 1.03977323, + "epoch": 0.2590109724936119, + "flos": 22127888603520.0, + "grad_norm": 1.6773571230169502, + "language_loss": 0.71946299, + "learning_rate": 3.474602179854327e-06, + "loss": 0.74734676, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.773141860961914 + }, + { + "auxiliary_loss_clip": 0.0148847, + "auxiliary_loss_mlp": 0.01287283, + "balance_loss_clip": 1.13843989, + "balance_loss_mlp": 1.02426004, + "epoch": 0.2590710957462799, + "flos": 13475668328640.0, + "grad_norm": 1.8718281059121453, + "language_loss": 0.845725, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.87348258, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.7108500003814697 + }, + { + "auxiliary_loss_clip": 0.01492614, + "auxiliary_loss_mlp": 0.01292542, + "balance_loss_clip": 1.14164972, + "balance_loss_mlp": 1.0289464, + "epoch": 0.25913121899894787, + "flos": 22309149101760.0, + "grad_norm": 1.8293460616159325, + "language_loss": 0.84975207, + "learning_rate": 3.474075855228966e-06, + "loss": 0.87760365, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.7814650535583496 + }, + { + "auxiliary_loss_clip": 0.01490091, + "auxiliary_loss_mlp": 0.01294806, + "balance_loss_clip": 1.1381284, + "balance_loss_mlp": 1.03178251, + "epoch": 0.25919134225161583, + "flos": 25814087413920.0, + "grad_norm": 1.7863620917431127, + "language_loss": 0.77499151, + "learning_rate": 3.473812609065639e-06, + "loss": 0.80284047, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.8075788021087646 + }, + { + "auxiliary_loss_clip": 0.01490465, + "auxiliary_loss_mlp": 0.01302615, + "balance_loss_clip": 1.13857007, + "balance_loss_mlp": 1.04092646, + "epoch": 0.2592514655042838, + "flos": 31214959604640.0, + "grad_norm": 1.9056085803641354, + "language_loss": 0.72646505, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.75439584, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 2.8729045391082764 + }, + { + "auxiliary_loss_clip": 0.01484208, + "auxiliary_loss_mlp": 0.01293027, + "balance_loss_clip": 1.13152695, + "balance_loss_mlp": 1.03038478, + "epoch": 0.25931158875695176, + "flos": 18476773705440.0, + "grad_norm": 1.8636433725956212, + "language_loss": 0.70567292, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.73344529, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.7642035484313965 + }, + { + "auxiliary_loss_clip": 0.01487287, + "auxiliary_loss_mlp": 0.01285222, + "balance_loss_clip": 1.13621259, + "balance_loss_mlp": 1.02830243, + "epoch": 0.2593717120096197, + "flos": 19209932324640.0, + "grad_norm": 1.5451974243900395, + "language_loss": 0.80332077, + "learning_rate": 3.473022535292867e-06, + "loss": 0.83104587, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.802685022354126 + }, + { + "auxiliary_loss_clip": 0.01483392, + "auxiliary_loss_mlp": 0.01285703, + "balance_loss_clip": 1.13192618, + "balance_loss_mlp": 1.02420592, + "epoch": 0.2594318352622877, + "flos": 31250764152000.0, + "grad_norm": 3.1310756276040856, + "language_loss": 0.67761004, + "learning_rate": 3.472759065640968e-06, + "loss": 0.70530093, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 2.906022787094116 + }, + { + "auxiliary_loss_clip": 0.01487578, + "auxiliary_loss_mlp": 0.0129034, + "balance_loss_clip": 1.13544202, + "balance_loss_mlp": 1.02903366, + "epoch": 0.25949195851495566, + "flos": 22239360558720.0, + "grad_norm": 1.514051791021998, + "language_loss": 0.7962634, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.82404256, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 2.818960428237915 + }, + { + "auxiliary_loss_clip": 0.01492715, + "auxiliary_loss_mlp": 0.01295689, + "balance_loss_clip": 1.139925, + "balance_loss_mlp": 1.03667116, + "epoch": 0.2595520817676236, + "flos": 28078307369280.0, + "grad_norm": 1.6197184441882706, + "language_loss": 0.77891278, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80679679, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 2.8717265129089355 + }, + { + "auxiliary_loss_clip": 0.01492847, + "auxiliary_loss_mlp": 0.01294335, + "balance_loss_clip": 1.13912535, + "balance_loss_mlp": 1.03360033, + "epoch": 0.2596122050202916, + "flos": 20192888355840.0, + "grad_norm": 2.031794287548412, + "language_loss": 0.77632129, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.80419314, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 4.4969353675842285 + }, + { + "auxiliary_loss_clip": 0.01493806, + "auxiliary_loss_mlp": 0.01294687, + "balance_loss_clip": 1.13983893, + "balance_loss_mlp": 1.04024696, + "epoch": 0.25967232827295955, + "flos": 22530120747840.0, + "grad_norm": 1.6310424959779284, + "language_loss": 0.7696113, + "learning_rate": 3.471704628661598e-06, + "loss": 0.79749626, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.8043601512908936 + }, + { + "auxiliary_loss_clip": 0.01494229, + "auxiliary_loss_mlp": 0.01287333, + "balance_loss_clip": 1.14218569, + "balance_loss_mlp": 1.03365564, + "epoch": 0.2597324515256275, + "flos": 21070327152960.0, + "grad_norm": 1.8730941589080339, + "language_loss": 0.76636577, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.79418147, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 2.8468263149261475 + }, + { + "auxiliary_loss_clip": 0.01488212, + "auxiliary_loss_mlp": 0.0129373, + "balance_loss_clip": 1.13550436, + "balance_loss_mlp": 1.03700066, + "epoch": 0.2597925747782955, + "flos": 22051955698560.0, + "grad_norm": 1.6014100066093513, + "language_loss": 0.71173418, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73955357, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 2.8372108936309814 + }, + { + "auxiliary_loss_clip": 0.01488703, + "auxiliary_loss_mlp": 0.01297645, + "balance_loss_clip": 1.13743258, + "balance_loss_mlp": 1.03633845, + "epoch": 0.2598526980309635, + "flos": 19539000319680.0, + "grad_norm": 2.823567107645075, + "language_loss": 0.74746287, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.77532637, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 2.8977880477905273 + }, + { + "auxiliary_loss_clip": 0.01495769, + "auxiliary_loss_mlp": 0.01299285, + "balance_loss_clip": 1.14424968, + "balance_loss_mlp": 1.04255605, + "epoch": 0.25991282128363147, + "flos": 24497322367680.0, + "grad_norm": 2.439711363160045, + "language_loss": 0.74054027, + "learning_rate": 3.470649298767278e-06, + "loss": 0.76849079, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 2.8638265132904053 + }, + { + "auxiliary_loss_clip": 0.014846, + "auxiliary_loss_mlp": 0.01317942, + "balance_loss_clip": 1.13291621, + "balance_loss_mlp": 1.05549073, + "epoch": 0.25997294453629943, + "flos": 24203452069440.0, + "grad_norm": 1.7814696173476778, + "language_loss": 0.67235786, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.70038325, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 4.295200347900391 + }, + { + "auxiliary_loss_clip": 0.01496201, + "auxiliary_loss_mlp": 0.0130093, + "balance_loss_clip": 1.14622331, + "balance_loss_mlp": 1.04572678, + "epoch": 0.2600330677889674, + "flos": 31434186555360.0, + "grad_norm": 2.112677626020363, + "language_loss": 0.71187836, + "learning_rate": 3.470121299177082e-06, + "loss": 0.73984969, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 4.3571789264678955 + }, + { + "auxiliary_loss_clip": 0.01489849, + "auxiliary_loss_mlp": 0.01317677, + "balance_loss_clip": 1.13853478, + "balance_loss_mlp": 1.06418991, + "epoch": 0.26009319104163536, + "flos": 32269069592640.0, + "grad_norm": 2.041300964878192, + "language_loss": 0.73467493, + "learning_rate": 3.469857215756257e-06, + "loss": 0.76275015, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 5.237579584121704 + }, + { + "auxiliary_loss_clip": 0.01492781, + "auxiliary_loss_mlp": 0.01306638, + "balance_loss_clip": 1.14164519, + "balance_loss_mlp": 1.0569663, + "epoch": 0.26015331429430333, + "flos": 26289180282240.0, + "grad_norm": 2.0602129937685953, + "language_loss": 0.87086833, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.89886248, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 2.8616514205932617 + }, + { + "auxiliary_loss_clip": 0.01495263, + "auxiliary_loss_mlp": 0.01301574, + "balance_loss_clip": 1.14480412, + "balance_loss_mlp": 1.04484487, + "epoch": 0.2602134375469713, + "flos": 21144477434400.0, + "grad_norm": 1.5275637209610984, + "language_loss": 0.80528164, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.83324999, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.780019521713257 + }, + { + "auxiliary_loss_clip": 0.01488486, + "auxiliary_loss_mlp": 0.01307971, + "balance_loss_clip": 1.13886845, + "balance_loss_mlp": 1.05391204, + "epoch": 0.26027356079963926, + "flos": 25923700889280.0, + "grad_norm": 1.7036565683329181, + "language_loss": 0.87836432, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.90632886, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.773721933364868 + }, + { + "auxiliary_loss_clip": 0.01489748, + "auxiliary_loss_mlp": 0.01309885, + "balance_loss_clip": 1.13955951, + "balance_loss_mlp": 1.05639839, + "epoch": 0.2603336840523072, + "flos": 26361623796480.0, + "grad_norm": 2.1220162962471765, + "language_loss": 0.78089929, + "learning_rate": 3.468800324801802e-06, + "loss": 0.80889559, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 2.862346649169922 + }, + { + "auxiliary_loss_clip": 0.01490111, + "auxiliary_loss_mlp": 0.01313934, + "balance_loss_clip": 1.14009094, + "balance_loss_mlp": 1.05548823, + "epoch": 0.2603938073049752, + "flos": 23515921391040.0, + "grad_norm": 1.5147932549505148, + "language_loss": 0.75811762, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.78615797, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.7648000717163086 + }, + { + "auxiliary_loss_clip": 0.01499196, + "auxiliary_loss_mlp": 0.01312773, + "balance_loss_clip": 1.14846992, + "balance_loss_mlp": 1.06004906, + "epoch": 0.26045393055764315, + "flos": 25376657572800.0, + "grad_norm": 1.8082917448356908, + "language_loss": 0.69254285, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.72066253, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.845458984375 + }, + { + "auxiliary_loss_clip": 0.01493237, + "auxiliary_loss_mlp": 0.01306462, + "balance_loss_clip": 1.1432662, + "balance_loss_mlp": 1.05202174, + "epoch": 0.2605140538103111, + "flos": 27637691562720.0, + "grad_norm": 2.147504036019132, + "language_loss": 0.80220562, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.83020258, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 2.792731761932373 + }, + { + "auxiliary_loss_clip": 0.01488859, + "auxiliary_loss_mlp": 0.01301434, + "balance_loss_clip": 1.13944685, + "balance_loss_mlp": 1.04832888, + "epoch": 0.2605741770629791, + "flos": 13771435034880.0, + "grad_norm": 2.47402528199221, + "language_loss": 0.80454737, + "learning_rate": 3.467742542694501e-06, + "loss": 0.83245027, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.819383144378662 + }, + { + "auxiliary_loss_clip": 0.01492438, + "auxiliary_loss_mlp": 0.01308544, + "balance_loss_clip": 1.14224756, + "balance_loss_mlp": 1.0516243, + "epoch": 0.26063430031564705, + "flos": 26034110856000.0, + "grad_norm": 1.8421774012340961, + "language_loss": 0.79865724, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82666707, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.8135955333709717 + }, + { + "auxiliary_loss_clip": 0.01644323, + "auxiliary_loss_mlp": 0.01390869, + "balance_loss_clip": 1.2961545, + "balance_loss_mlp": 1.16351318, + "epoch": 0.26069442356831507, + "flos": 62450286631200.0, + "grad_norm": 0.9239460654047882, + "language_loss": 0.60855675, + "learning_rate": 3.467213317659068e-06, + "loss": 0.63890868, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.319950819015503 + }, + { + "auxiliary_loss_clip": 0.014977, + "auxiliary_loss_mlp": 0.01310324, + "balance_loss_clip": 1.14744544, + "balance_loss_mlp": 1.05378532, + "epoch": 0.26075454682098304, + "flos": 13628103060960.0, + "grad_norm": 1.9679838511889578, + "language_loss": 0.77618587, + "learning_rate": 3.46694862168102e-06, + "loss": 0.8042661, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.788658380508423 + }, + { + "auxiliary_loss_clip": 0.01493897, + "auxiliary_loss_mlp": 0.01306252, + "balance_loss_clip": 1.14407992, + "balance_loss_mlp": 1.05066729, + "epoch": 0.260814670073651, + "flos": 12127953539520.0, + "grad_norm": 1.9698907236572736, + "language_loss": 0.74434531, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.77234679, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.8095767498016357 + }, + { + "auxiliary_loss_clip": 0.01492576, + "auxiliary_loss_mlp": 0.01305797, + "balance_loss_clip": 1.14196789, + "balance_loss_mlp": 1.05021214, + "epoch": 0.26087479332631897, + "flos": 15124345981920.0, + "grad_norm": 2.33655285374031, + "language_loss": 0.8069877, + "learning_rate": 3.466419062854447e-06, + "loss": 0.83497143, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.812913656234741 + }, + { + "auxiliary_loss_clip": 0.0149493, + "auxiliary_loss_mlp": 0.01313015, + "balance_loss_clip": 1.14512706, + "balance_loss_mlp": 1.05895615, + "epoch": 0.26093491657898693, + "flos": 24683248029600.0, + "grad_norm": 1.5745846652501665, + "language_loss": 0.7656951, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.7937746, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 2.7921667098999023 + }, + { + "auxiliary_loss_clip": 0.0149729, + "auxiliary_loss_mlp": 0.01316494, + "balance_loss_clip": 1.14720714, + "balance_loss_mlp": 1.06071854, + "epoch": 0.2609950398316549, + "flos": 25118326324800.0, + "grad_norm": 1.7159514995942213, + "language_loss": 0.82731628, + "learning_rate": 3.465889281600845e-06, + "loss": 0.85545409, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 2.8355493545532227 + }, + { + "auxiliary_loss_clip": 0.01499461, + "auxiliary_loss_mlp": 0.01309763, + "balance_loss_clip": 1.14987373, + "balance_loss_mlp": 1.056849, + "epoch": 0.26105516308432286, + "flos": 28551124548000.0, + "grad_norm": 2.3808518833521637, + "language_loss": 0.76653433, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79462659, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.824345350265503 + }, + { + "auxiliary_loss_clip": 0.01489521, + "auxiliary_loss_mlp": 0.01306122, + "balance_loss_clip": 1.14010191, + "balance_loss_mlp": 1.04958379, + "epoch": 0.2611152863369908, + "flos": 39533750146080.0, + "grad_norm": 1.8442848968917775, + "language_loss": 0.66379708, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.69175351, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.944253921508789 + }, + { + "auxiliary_loss_clip": 0.01499523, + "auxiliary_loss_mlp": 0.01299084, + "balance_loss_clip": 1.15074599, + "balance_loss_mlp": 1.04006612, + "epoch": 0.2611754095896588, + "flos": 13737033829440.0, + "grad_norm": 2.4416952067439675, + "language_loss": 0.73729289, + "learning_rate": 3.465094192845553e-06, + "loss": 0.765279, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.911625623703003 + }, + { + "auxiliary_loss_clip": 0.01503917, + "auxiliary_loss_mlp": 0.01312194, + "balance_loss_clip": 1.15543222, + "balance_loss_mlp": 1.05508339, + "epoch": 0.26123553284232676, + "flos": 21508781054400.0, + "grad_norm": 2.243243399941816, + "language_loss": 0.86909038, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.89725149, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.7941346168518066 + }, + { + "auxiliary_loss_clip": 0.01504013, + "auxiliary_loss_mlp": 0.0131338, + "balance_loss_clip": 1.15490222, + "balance_loss_mlp": 1.0597024, + "epoch": 0.2612956560949947, + "flos": 21141784535040.0, + "grad_norm": 1.9623687817560596, + "language_loss": 0.76313651, + "learning_rate": 3.464563855876015e-06, + "loss": 0.79131043, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.843496561050415 + }, + { + "auxiliary_loss_clip": 0.01496895, + "auxiliary_loss_mlp": 0.01288906, + "balance_loss_clip": 1.14781272, + "balance_loss_mlp": 1.03274882, + "epoch": 0.2613557793476627, + "flos": 25121360577600.0, + "grad_norm": 2.2991951317555515, + "language_loss": 0.75911295, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78697091, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.8432042598724365 + }, + { + "auxiliary_loss_clip": 0.01505058, + "auxiliary_loss_mlp": 0.0129559, + "balance_loss_clip": 1.15597653, + "balance_loss_mlp": 1.04191279, + "epoch": 0.26141590260033065, + "flos": 26070105044160.0, + "grad_norm": 1.4027373854849692, + "language_loss": 0.73275542, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.76076192, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.8527536392211914 + }, + { + "auxiliary_loss_clip": 0.01500957, + "auxiliary_loss_mlp": 0.01291109, + "balance_loss_clip": 1.15242016, + "balance_loss_mlp": 1.02865791, + "epoch": 0.2614760258529987, + "flos": 25703829159840.0, + "grad_norm": 1.8613796823164799, + "language_loss": 0.90930188, + "learning_rate": 3.463767933923799e-06, + "loss": 0.9372226, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.77516508102417 + }, + { + "auxiliary_loss_clip": 0.01500426, + "auxiliary_loss_mlp": 0.01289852, + "balance_loss_clip": 1.15149915, + "balance_loss_mlp": 1.03312302, + "epoch": 0.26153614910566664, + "flos": 17459075115360.0, + "grad_norm": 1.8903328161601773, + "language_loss": 0.80014545, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82804823, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.775479316711426 + }, + { + "auxiliary_loss_clip": 0.01498985, + "auxiliary_loss_mlp": 0.01288874, + "balance_loss_clip": 1.15051532, + "balance_loss_mlp": 1.03061938, + "epoch": 0.2615962723583346, + "flos": 17714789320320.0, + "grad_norm": 1.881267409315388, + "language_loss": 0.62802553, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.65590417, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.7820560932159424 + }, + { + "auxiliary_loss_clip": 0.01493748, + "auxiliary_loss_mlp": 0.01306877, + "balance_loss_clip": 1.14598846, + "balance_loss_mlp": 1.04785967, + "epoch": 0.26165639561100257, + "flos": 23259941688960.0, + "grad_norm": 2.149712968078703, + "language_loss": 0.83643085, + "learning_rate": 3.462971512415555e-06, + "loss": 0.8644371, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.7626020908355713 + }, + { + "auxiliary_loss_clip": 0.01670055, + "auxiliary_loss_mlp": 0.01294525, + "balance_loss_clip": 1.32734001, + "balance_loss_mlp": 1.07632446, + "epoch": 0.26171651886367053, + "flos": 66744062830080.0, + "grad_norm": 0.9064906643051863, + "language_loss": 0.70487976, + "learning_rate": 3.462705927613996e-06, + "loss": 0.7345255, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 3.17755389213562 + }, + { + "auxiliary_loss_clip": 0.0149895, + "auxiliary_loss_mlp": 0.01290481, + "balance_loss_clip": 1.15024543, + "balance_loss_mlp": 1.03203511, + "epoch": 0.2617766421163385, + "flos": 22351970358720.0, + "grad_norm": 1.908928569240548, + "language_loss": 0.78198004, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.8098743, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 2.8818817138671875 + }, + { + "auxiliary_loss_clip": 0.01502152, + "auxiliary_loss_mlp": 0.01300254, + "balance_loss_clip": 1.15420043, + "balance_loss_mlp": 1.03551412, + "epoch": 0.26183676536900646, + "flos": 26069498193600.0, + "grad_norm": 1.887940037513348, + "language_loss": 0.68234456, + "learning_rate": 3.462174591623085e-06, + "loss": 0.71036863, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 2.8716084957122803 + }, + { + "auxiliary_loss_clip": 0.01501013, + "auxiliary_loss_mlp": 0.01289797, + "balance_loss_clip": 1.15223551, + "balance_loss_mlp": 1.0265826, + "epoch": 0.26189688862167443, + "flos": 20998604273760.0, + "grad_norm": 2.1085169372556867, + "language_loss": 0.676036, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.70394409, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 2.8061232566833496 + }, + { + "auxiliary_loss_clip": 0.01664538, + "auxiliary_loss_mlp": 0.0124369, + "balance_loss_clip": 1.32275879, + "balance_loss_mlp": 1.02091217, + "epoch": 0.2619570118743424, + "flos": 65804800403520.0, + "grad_norm": 0.6871480998973687, + "language_loss": 0.53142971, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.56051201, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 4.727796792984009 + }, + { + "auxiliary_loss_clip": 0.0149974, + "auxiliary_loss_mlp": 0.0127898, + "balance_loss_clip": 1.15158677, + "balance_loss_mlp": 1.01347733, + "epoch": 0.26201713512701036, + "flos": 28769327438400.0, + "grad_norm": 1.827290373615031, + "language_loss": 0.84447175, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.87225896, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 2.808394193649292 + }, + { + "auxiliary_loss_clip": 0.01494632, + "auxiliary_loss_mlp": 0.01294107, + "balance_loss_clip": 1.14572823, + "balance_loss_mlp": 1.02497983, + "epoch": 0.2620772583796783, + "flos": 26434939658400.0, + "grad_norm": 2.3814491532142252, + "language_loss": 0.67387187, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.70175922, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.8106985092163086 + }, + { + "auxiliary_loss_clip": 0.01502648, + "auxiliary_loss_mlp": 0.01296739, + "balance_loss_clip": 1.15509105, + "balance_loss_mlp": 1.03276217, + "epoch": 0.2621373816323463, + "flos": 20158600934880.0, + "grad_norm": 1.9764347419671437, + "language_loss": 0.78867376, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.81666768, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 2.8079848289489746 + }, + { + "auxiliary_loss_clip": 0.01497797, + "auxiliary_loss_mlp": 0.01295527, + "balance_loss_clip": 1.15030837, + "balance_loss_mlp": 1.03975141, + "epoch": 0.26219750488501425, + "flos": 28623757703040.0, + "grad_norm": 4.526970013902979, + "language_loss": 0.68917328, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.71710652, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 2.9257657527923584 + }, + { + "auxiliary_loss_clip": 0.01501551, + "auxiliary_loss_mlp": 0.01303653, + "balance_loss_clip": 1.15426171, + "balance_loss_mlp": 1.04043889, + "epoch": 0.2622576281376823, + "flos": 15043975482240.0, + "grad_norm": 2.024283994714281, + "language_loss": 0.84448987, + "learning_rate": 3.46031316964119e-06, + "loss": 0.8725419, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 2.865849733352661 + }, + { + "auxiliary_loss_clip": 0.01503567, + "auxiliary_loss_mlp": 0.01288795, + "balance_loss_clip": 1.15582454, + "balance_loss_mlp": 1.02901387, + "epoch": 0.26231775139035024, + "flos": 26398604116800.0, + "grad_norm": 2.061178951089046, + "language_loss": 0.65067506, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67859864, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 4.281650543212891 + }, + { + "auxiliary_loss_clip": 0.01662808, + "auxiliary_loss_mlp": 0.01261993, + "balance_loss_clip": 1.32106042, + "balance_loss_mlp": 1.04379272, + "epoch": 0.2623778746430182, + "flos": 65416108612320.0, + "grad_norm": 0.8896273022198564, + "language_loss": 0.61069703, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63994509, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 5.375660419464111 + }, + { + "auxiliary_loss_clip": 0.01494831, + "auxiliary_loss_mlp": 0.01300455, + "balance_loss_clip": 1.14831901, + "balance_loss_mlp": 1.03743148, + "epoch": 0.26243799789568617, + "flos": 12605853091680.0, + "grad_norm": 2.829280323893326, + "language_loss": 0.71261477, + "learning_rate": 3.459514586533184e-06, + "loss": 0.74056768, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 2.8027029037475586 + }, + { + "auxiliary_loss_clip": 0.01495634, + "auxiliary_loss_mlp": 0.01298959, + "balance_loss_clip": 1.14861536, + "balance_loss_mlp": 1.04146695, + "epoch": 0.26249812114835414, + "flos": 28626488530560.0, + "grad_norm": 1.5649527498898164, + "language_loss": 0.7756657, + "learning_rate": 3.459248281460509e-06, + "loss": 0.80361164, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.8251452445983887 + }, + { + "auxiliary_loss_clip": 0.01492529, + "auxiliary_loss_mlp": 0.01290921, + "balance_loss_clip": 1.14697254, + "balance_loss_mlp": 1.02866054, + "epoch": 0.2625582444010221, + "flos": 14467158195840.0, + "grad_norm": 1.842060250168126, + "language_loss": 0.76341271, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.79124719, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.7806055545806885 + }, + { + "auxiliary_loss_clip": 0.01503801, + "auxiliary_loss_mlp": 0.01297549, + "balance_loss_clip": 1.15830338, + "balance_loss_mlp": 1.03547978, + "epoch": 0.26261836765369007, + "flos": 16614785894400.0, + "grad_norm": 1.9012007849653079, + "language_loss": 0.69685471, + "learning_rate": 3.458715505320736e-06, + "loss": 0.7248683, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.726029634475708 + }, + { + "auxiliary_loss_clip": 0.01499503, + "auxiliary_loss_mlp": 0.01287972, + "balance_loss_clip": 1.15352941, + "balance_loss_mlp": 1.02876329, + "epoch": 0.26267849090635803, + "flos": 20521918422720.0, + "grad_norm": 2.280853834796198, + "language_loss": 0.78810203, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81597674, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 2.8028347492218018 + }, + { + "auxiliary_loss_clip": 0.01499914, + "auxiliary_loss_mlp": 0.01290937, + "balance_loss_clip": 1.15376937, + "balance_loss_mlp": 1.03249133, + "epoch": 0.262738614159026, + "flos": 21326041357920.0, + "grad_norm": 1.925414422692189, + "language_loss": 0.83615172, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.86406022, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.776860237121582 + }, + { + "auxiliary_loss_clip": 0.01502872, + "auxiliary_loss_mlp": 0.01298682, + "balance_loss_clip": 1.15565181, + "balance_loss_mlp": 1.03699338, + "epoch": 0.26279873741169396, + "flos": 17605555126560.0, + "grad_norm": 1.7620386993393256, + "language_loss": 0.71094567, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73896122, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.7791547775268555 + }, + { + "auxiliary_loss_clip": 0.01673233, + "auxiliary_loss_mlp": 0.01241379, + "balance_loss_clip": 1.33424783, + "balance_loss_mlp": 1.0193634, + "epoch": 0.2628588606643619, + "flos": 60956243543520.0, + "grad_norm": 0.7040984316416763, + "language_loss": 0.56373072, + "learning_rate": 3.457649289346384e-06, + "loss": 0.59287685, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.454834461212158 + }, + { + "auxiliary_loss_clip": 0.01506258, + "auxiliary_loss_mlp": 0.012876, + "balance_loss_clip": 1.15996575, + "balance_loss_mlp": 1.03144288, + "epoch": 0.2629189839170299, + "flos": 27018773654400.0, + "grad_norm": 4.7944233157045835, + "language_loss": 0.77958238, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.80752099, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.819380044937134 + }, + { + "auxiliary_loss_clip": 0.01504063, + "auxiliary_loss_mlp": 0.01291577, + "balance_loss_clip": 1.15739369, + "balance_loss_mlp": 1.03522897, + "epoch": 0.26297910716969786, + "flos": 17021796986880.0, + "grad_norm": 3.6566826977503704, + "language_loss": 0.71753943, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.74549586, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.8390085697174072 + }, + { + "auxiliary_loss_clip": 0.01512871, + "auxiliary_loss_mlp": 0.01304252, + "balance_loss_clip": 1.16669488, + "balance_loss_mlp": 1.05133784, + "epoch": 0.2630392304223659, + "flos": 24899554512000.0, + "grad_norm": 1.655465008063287, + "language_loss": 0.80949044, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83766174, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.8073110580444336 + }, + { + "auxiliary_loss_clip": 0.01512772, + "auxiliary_loss_mlp": 0.01303737, + "balance_loss_clip": 1.16580367, + "balance_loss_mlp": 1.05215764, + "epoch": 0.26309935367503384, + "flos": 32856810189120.0, + "grad_norm": 1.7693688590855936, + "language_loss": 0.66588426, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.69404936, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.8622708320617676 + }, + { + "auxiliary_loss_clip": 0.01509409, + "auxiliary_loss_mlp": 0.01300186, + "balance_loss_clip": 1.16098571, + "balance_loss_mlp": 1.04555476, + "epoch": 0.2631594769277018, + "flos": 15889781829600.0, + "grad_norm": 1.9000321243525113, + "language_loss": 0.69777358, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.72586954, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.801546812057495 + }, + { + "auxiliary_loss_clip": 0.01507763, + "auxiliary_loss_mlp": 0.01307023, + "balance_loss_clip": 1.16073477, + "balance_loss_mlp": 1.05201077, + "epoch": 0.2632196001803698, + "flos": 50808728916000.0, + "grad_norm": 1.7106779183510774, + "language_loss": 0.79181266, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.81996047, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 3.0625250339508057 + }, + { + "auxiliary_loss_clip": 0.01517121, + "auxiliary_loss_mlp": 0.01311576, + "balance_loss_clip": 1.17252851, + "balance_loss_mlp": 1.05732656, + "epoch": 0.26327972343303774, + "flos": 13734606427200.0, + "grad_norm": 2.266344680711187, + "language_loss": 0.7628473, + "learning_rate": 3.455781283723846e-06, + "loss": 0.79113424, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 2.763193130493164 + }, + { + "auxiliary_loss_clip": 0.01519492, + "auxiliary_loss_mlp": 0.01312263, + "balance_loss_clip": 1.17385268, + "balance_loss_mlp": 1.05820382, + "epoch": 0.2633398466857057, + "flos": 23771332170720.0, + "grad_norm": 2.886596453851963, + "language_loss": 0.78097725, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.80929482, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 2.8121252059936523 + }, + { + "auxiliary_loss_clip": 0.01514555, + "auxiliary_loss_mlp": 0.01314212, + "balance_loss_clip": 1.16824758, + "balance_loss_mlp": 1.05881846, + "epoch": 0.26339996993837367, + "flos": 27602721434880.0, + "grad_norm": 2.139107889625522, + "language_loss": 0.6421892, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.67047685, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.893338441848755 + }, + { + "auxiliary_loss_clip": 0.01517139, + "auxiliary_loss_mlp": 0.01296569, + "balance_loss_clip": 1.17096615, + "balance_loss_mlp": 1.04003036, + "epoch": 0.26346009319104163, + "flos": 16948215627840.0, + "grad_norm": 1.8541991676220377, + "language_loss": 0.83245635, + "learning_rate": 3.454979881632595e-06, + "loss": 0.86059344, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.7733705043792725 + }, + { + "auxiliary_loss_clip": 0.01524149, + "auxiliary_loss_mlp": 0.01306789, + "balance_loss_clip": 1.17737329, + "balance_loss_mlp": 1.050632, + "epoch": 0.2635202164437096, + "flos": 37235242769760.0, + "grad_norm": 1.9841197755049023, + "language_loss": 0.70422137, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.73253071, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.900519847869873 + }, + { + "auxiliary_loss_clip": 0.01517905, + "auxiliary_loss_mlp": 0.01295385, + "balance_loss_clip": 1.17126513, + "balance_loss_mlp": 1.03979993, + "epoch": 0.26358033969637756, + "flos": 20998490489280.0, + "grad_norm": 1.879900153149087, + "language_loss": 0.69853884, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.72667181, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.80295729637146 + }, + { + "auxiliary_loss_clip": 0.01513255, + "auxiliary_loss_mlp": 0.01287312, + "balance_loss_clip": 1.16653717, + "balance_loss_mlp": 1.03210902, + "epoch": 0.26364046294904553, + "flos": 27748594595520.0, + "grad_norm": 4.0921709043336, + "language_loss": 0.7032758, + "learning_rate": 3.45417798298451e-06, + "loss": 0.73128146, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.792369842529297 + }, + { + "auxiliary_loss_clip": 0.01514944, + "auxiliary_loss_mlp": 0.01292548, + "balance_loss_clip": 1.16763341, + "balance_loss_mlp": 1.03581858, + "epoch": 0.2637005862017135, + "flos": 22895296715520.0, + "grad_norm": 1.9659845062832266, + "language_loss": 0.85185981, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87993467, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.786561965942383 + }, + { + "auxiliary_loss_clip": 0.01510866, + "auxiliary_loss_mlp": 0.01294152, + "balance_loss_clip": 1.16324592, + "balance_loss_mlp": 1.03570676, + "epoch": 0.26376070945438146, + "flos": 15050612910240.0, + "grad_norm": 5.133865268292725, + "language_loss": 0.77847642, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.8065266, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 2.7495687007904053 + }, + { + "auxiliary_loss_clip": 0.01518249, + "auxiliary_loss_mlp": 0.01288339, + "balance_loss_clip": 1.17129707, + "balance_loss_mlp": 1.02645993, + "epoch": 0.2638208327070494, + "flos": 21143946440160.0, + "grad_norm": 1.9160595713445672, + "language_loss": 0.76293421, + "learning_rate": 3.453375588053264e-06, + "loss": 0.79100001, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 2.9047176837921143 + }, + { + "auxiliary_loss_clip": 0.01505143, + "auxiliary_loss_mlp": 0.01290799, + "balance_loss_clip": 1.15634537, + "balance_loss_mlp": 1.03731227, + "epoch": 0.26388095595971744, + "flos": 21727742508000.0, + "grad_norm": 2.227920052403554, + "language_loss": 0.86428958, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.89224899, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.7667434215545654 + }, + { + "auxiliary_loss_clip": 0.01651379, + "auxiliary_loss_mlp": 0.01247383, + "balance_loss_clip": 1.31086409, + "balance_loss_mlp": 1.02918243, + "epoch": 0.2639410792123854, + "flos": 65522574050400.0, + "grad_norm": 0.8062916861758445, + "language_loss": 0.60245985, + "learning_rate": 3.452840382521457e-06, + "loss": 0.63144743, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.3032119274139404 + }, + { + "auxiliary_loss_clip": 0.01505522, + "auxiliary_loss_mlp": 0.01284486, + "balance_loss_clip": 1.1577034, + "balance_loss_mlp": 1.02565885, + "epoch": 0.2640012024650534, + "flos": 23950696260960.0, + "grad_norm": 1.5778855367585958, + "language_loss": 0.77789557, + "learning_rate": 3.4525726971127e-06, + "loss": 0.80579567, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.9033212661743164 + }, + { + "auxiliary_loss_clip": 0.01652764, + "auxiliary_loss_mlp": 0.01252808, + "balance_loss_clip": 1.31183219, + "balance_loss_mlp": 1.03613281, + "epoch": 0.26406132571772134, + "flos": 56448284839200.0, + "grad_norm": 0.9276609919033171, + "language_loss": 0.58679992, + "learning_rate": 3.45230495662224e-06, + "loss": 0.61585563, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 3.264455795288086 + }, + { + "auxiliary_loss_clip": 0.01510919, + "auxiliary_loss_mlp": 0.01298683, + "balance_loss_clip": 1.16198421, + "balance_loss_mlp": 1.04195404, + "epoch": 0.2641214489703893, + "flos": 22092728834880.0, + "grad_norm": 1.6699830651139762, + "language_loss": 0.68434584, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.7124418, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 2.8681788444519043 + }, + { + "auxiliary_loss_clip": 0.01503625, + "auxiliary_loss_mlp": 0.01301728, + "balance_loss_clip": 1.15431464, + "balance_loss_mlp": 1.04271054, + "epoch": 0.26418157222305727, + "flos": 16546628262240.0, + "grad_norm": 1.89687405692145, + "language_loss": 0.84325576, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.87130934, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 4.415528059005737 + }, + { + "auxiliary_loss_clip": 0.01507913, + "auxiliary_loss_mlp": 0.01292494, + "balance_loss_clip": 1.15821671, + "balance_loss_mlp": 1.03271341, + "epoch": 0.26424169547572524, + "flos": 18004335808320.0, + "grad_norm": 2.289860214133722, + "language_loss": 0.70629752, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.73430163, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 2.7511565685272217 + }, + { + "auxiliary_loss_clip": 0.01506069, + "auxiliary_loss_mlp": 0.01288101, + "balance_loss_clip": 1.15876853, + "balance_loss_mlp": 1.03060913, + "epoch": 0.2643018187283932, + "flos": 16985082163680.0, + "grad_norm": 1.8588605835612992, + "language_loss": 0.86944067, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89738238, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 2.7730040550231934 + }, + { + "auxiliary_loss_clip": 0.01644266, + "auxiliary_loss_mlp": 0.01241997, + "balance_loss_clip": 1.30379689, + "balance_loss_mlp": 1.02379608, + "epoch": 0.26436194198106117, + "flos": 59670314455680.0, + "grad_norm": 0.7914472238647198, + "language_loss": 0.54966557, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57852817, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 3.1085026264190674 + }, + { + "auxiliary_loss_clip": 0.01505704, + "auxiliary_loss_mlp": 0.01287149, + "balance_loss_clip": 1.15732038, + "balance_loss_mlp": 1.02946591, + "epoch": 0.26442206523372913, + "flos": 32923792048320.0, + "grad_norm": 2.266263489518502, + "language_loss": 0.77900243, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80693096, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 2.8653156757354736 + }, + { + "auxiliary_loss_clip": 0.01507159, + "auxiliary_loss_mlp": 0.01302129, + "balance_loss_clip": 1.15899706, + "balance_loss_mlp": 1.04234767, + "epoch": 0.2644821884863971, + "flos": 21033346832640.0, + "grad_norm": 2.293054801178952, + "language_loss": 0.67253023, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.70062315, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 2.881688117980957 + }, + { + "auxiliary_loss_clip": 0.01510442, + "auxiliary_loss_mlp": 0.0130639, + "balance_loss_clip": 1.16187966, + "balance_loss_mlp": 1.05423808, + "epoch": 0.26454231173906506, + "flos": 20778884256960.0, + "grad_norm": 1.6323503852632335, + "language_loss": 0.86005473, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88822305, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 2.7953383922576904 + }, + { + "auxiliary_loss_clip": 0.01505339, + "auxiliary_loss_mlp": 0.01287779, + "balance_loss_clip": 1.15653658, + "balance_loss_mlp": 1.03276682, + "epoch": 0.264602434991733, + "flos": 16620588902880.0, + "grad_norm": 2.310151848789598, + "language_loss": 0.7605018, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.78843302, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 4.1835010051727295 + }, + { + "auxiliary_loss_clip": 0.0149581, + "auxiliary_loss_mlp": 0.01296462, + "balance_loss_clip": 1.14604115, + "balance_loss_mlp": 1.0385884, + "epoch": 0.26466255824440105, + "flos": 19064703942720.0, + "grad_norm": 1.701640876057047, + "language_loss": 0.88185334, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90977609, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 5.898227214813232 + }, + { + "auxiliary_loss_clip": 0.01503402, + "auxiliary_loss_mlp": 0.01304791, + "balance_loss_clip": 1.15432239, + "balance_loss_mlp": 1.05092263, + "epoch": 0.264722681497069, + "flos": 22640796211680.0, + "grad_norm": 1.8273690504330335, + "language_loss": 0.7861948, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.8142767, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 2.8573970794677734 + }, + { + "auxiliary_loss_clip": 0.01500574, + "auxiliary_loss_mlp": 0.01297797, + "balance_loss_clip": 1.15116966, + "balance_loss_mlp": 1.04392862, + "epoch": 0.264782804749737, + "flos": 22494467913120.0, + "grad_norm": 2.077154658842461, + "language_loss": 0.88464499, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.91262865, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 2.7343780994415283 + }, + { + "auxiliary_loss_clip": 0.01501022, + "auxiliary_loss_mlp": 0.01301503, + "balance_loss_clip": 1.15311766, + "balance_loss_mlp": 1.0476346, + "epoch": 0.26484292800240494, + "flos": 16802152826400.0, + "grad_norm": 1.8506135922273814, + "language_loss": 0.76413929, + "learning_rate": 3.448819322433709e-06, + "loss": 0.79216456, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.7847580909729004 + }, + { + "auxiliary_loss_clip": 0.01506624, + "auxiliary_loss_mlp": 0.01311768, + "balance_loss_clip": 1.15763068, + "balance_loss_mlp": 1.0569458, + "epoch": 0.2649030512550729, + "flos": 20451712669920.0, + "grad_norm": 2.401598374420297, + "language_loss": 0.70086253, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72904646, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.7440924644470215 + }, + { + "auxiliary_loss_clip": 0.01502448, + "auxiliary_loss_mlp": 0.01301534, + "balance_loss_clip": 1.15528059, + "balance_loss_mlp": 1.04594874, + "epoch": 0.2649631745077409, + "flos": 22418155726560.0, + "grad_norm": 1.7018292984009225, + "language_loss": 0.83843327, + "learning_rate": 3.448282246369912e-06, + "loss": 0.86647308, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.7217416763305664 + }, + { + "auxiliary_loss_clip": 0.01502808, + "auxiliary_loss_mlp": 0.01300624, + "balance_loss_clip": 1.15449047, + "balance_loss_mlp": 1.04599261, + "epoch": 0.26502329776040884, + "flos": 35119019952000.0, + "grad_norm": 1.8182380929943576, + "language_loss": 0.76481998, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.79285431, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.9269251823425293 + }, + { + "auxiliary_loss_clip": 0.01509907, + "auxiliary_loss_mlp": 0.01307081, + "balance_loss_clip": 1.16064, + "balance_loss_mlp": 1.05607367, + "epoch": 0.2650834210130768, + "flos": 38690371200960.0, + "grad_norm": 1.8520306793861299, + "language_loss": 0.71000624, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73817611, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 3.003551959991455 + }, + { + "auxiliary_loss_clip": 0.01502612, + "auxiliary_loss_mlp": 0.01294485, + "balance_loss_clip": 1.15496349, + "balance_loss_mlp": 1.04137957, + "epoch": 0.26514354426574477, + "flos": 24719128433280.0, + "grad_norm": 1.94838978238119, + "language_loss": 0.73206377, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.76003468, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.8727002143859863 + }, + { + "auxiliary_loss_clip": 0.01502115, + "auxiliary_loss_mlp": 0.01298397, + "balance_loss_clip": 1.15464413, + "balance_loss_mlp": 1.04224014, + "epoch": 0.26520366751841273, + "flos": 20342175050880.0, + "grad_norm": 2.276022576724049, + "language_loss": 0.73496109, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.76296616, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.7593348026275635 + }, + { + "auxiliary_loss_clip": 0.01498985, + "auxiliary_loss_mlp": 0.01282222, + "balance_loss_clip": 1.15031409, + "balance_loss_mlp": 1.0302608, + "epoch": 0.2652637907710807, + "flos": 22345939781280.0, + "grad_norm": 2.4924262646910167, + "language_loss": 0.82112145, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84893358, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.8298757076263428 + }, + { + "auxiliary_loss_clip": 0.01501749, + "auxiliary_loss_mlp": 0.01284998, + "balance_loss_clip": 1.15508235, + "balance_loss_mlp": 1.02769625, + "epoch": 0.26532391402374866, + "flos": 19356260623200.0, + "grad_norm": 1.7120011657483114, + "language_loss": 0.74671984, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.77458727, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.8030731678009033 + }, + { + "auxiliary_loss_clip": 0.01641336, + "auxiliary_loss_mlp": 0.01263214, + "balance_loss_clip": 1.30375683, + "balance_loss_mlp": 1.04425049, + "epoch": 0.26538403727641663, + "flos": 44793262262880.0, + "grad_norm": 0.8771825528288038, + "language_loss": 0.56962943, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59867495, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 3.2173895835876465 + }, + { + "auxiliary_loss_clip": 0.01505748, + "auxiliary_loss_mlp": 0.01303047, + "balance_loss_clip": 1.15835786, + "balance_loss_mlp": 1.05356562, + "epoch": 0.26544416052908465, + "flos": 28184545238400.0, + "grad_norm": 1.7976966023162633, + "language_loss": 0.75240093, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.78048885, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.855043649673462 + }, + { + "auxiliary_loss_clip": 0.01502157, + "auxiliary_loss_mlp": 0.01305183, + "balance_loss_clip": 1.15326095, + "balance_loss_mlp": 1.0494076, + "epoch": 0.2655042837817526, + "flos": 17567133536160.0, + "grad_norm": 11.271384248749687, + "language_loss": 0.87089753, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.8989709, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.7194554805755615 + }, + { + "auxiliary_loss_clip": 0.01504415, + "auxiliary_loss_mlp": 0.01303588, + "balance_loss_clip": 1.15667772, + "balance_loss_mlp": 1.04800344, + "epoch": 0.2655644070344206, + "flos": 23406914766240.0, + "grad_norm": 1.7046162333982082, + "language_loss": 0.76511073, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.79319072, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.8575496673583984 + }, + { + "auxiliary_loss_clip": 0.01506944, + "auxiliary_loss_mlp": 0.01302634, + "balance_loss_clip": 1.15895772, + "balance_loss_mlp": 1.04952931, + "epoch": 0.26562453028708854, + "flos": 26470857990240.0, + "grad_norm": 1.6092892351608055, + "language_loss": 0.80215657, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.83025235, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 2.7846648693084717 + }, + { + "auxiliary_loss_clip": 0.01506808, + "auxiliary_loss_mlp": 0.01306251, + "balance_loss_clip": 1.15844631, + "balance_loss_mlp": 1.05352747, + "epoch": 0.2656846535397565, + "flos": 19209553043040.0, + "grad_norm": 2.7470024191236324, + "language_loss": 0.66936827, + "learning_rate": 3.445055179644071e-06, + "loss": 0.6974988, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.7751352787017822 + }, + { + "auxiliary_loss_clip": 0.01505507, + "auxiliary_loss_mlp": 0.01303441, + "balance_loss_clip": 1.15735984, + "balance_loss_mlp": 1.0512898, + "epoch": 0.2657447767924245, + "flos": 30553751433600.0, + "grad_norm": 1.657643058467638, + "language_loss": 0.79407859, + "learning_rate": 3.444785900995585e-06, + "loss": 0.82216811, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.8734078407287598 + }, + { + "auxiliary_loss_clip": 0.01499505, + "auxiliary_loss_mlp": 0.01307599, + "balance_loss_clip": 1.15092802, + "balance_loss_mlp": 1.05449343, + "epoch": 0.26580490004509244, + "flos": 20924833273920.0, + "grad_norm": 2.1151043013661375, + "language_loss": 0.81756961, + "learning_rate": 3.444516567560673e-06, + "loss": 0.84564072, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.7803571224212646 + }, + { + "auxiliary_loss_clip": 0.01500551, + "auxiliary_loss_mlp": 0.01307571, + "balance_loss_clip": 1.15212655, + "balance_loss_mlp": 1.05713582, + "epoch": 0.2658650232977604, + "flos": 43949087190720.0, + "grad_norm": 2.2472715696681993, + "language_loss": 0.6578691, + "learning_rate": 3.444247179349548e-06, + "loss": 0.68595028, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 3.0130152702331543 + }, + { + "auxiliary_loss_clip": 0.01504496, + "auxiliary_loss_mlp": 0.01304087, + "balance_loss_clip": 1.15554678, + "balance_loss_mlp": 1.05288959, + "epoch": 0.26592514655042837, + "flos": 29719058037120.0, + "grad_norm": 2.349492290114776, + "language_loss": 0.75085485, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.77894062, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.791534423828125 + }, + { + "auxiliary_loss_clip": 0.01498319, + "auxiliary_loss_mlp": 0.0130128, + "balance_loss_clip": 1.14990795, + "balance_loss_mlp": 1.04855621, + "epoch": 0.26598526980309634, + "flos": 46681041951360.0, + "grad_norm": 1.5679359147463903, + "language_loss": 0.77650118, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80449718, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 3.0351438522338867 + }, + { + "auxiliary_loss_clip": 0.01495782, + "auxiliary_loss_mlp": 0.01304248, + "balance_loss_clip": 1.14842677, + "balance_loss_mlp": 1.04599333, + "epoch": 0.2660453930557643, + "flos": 11511007895520.0, + "grad_norm": 2.1319495706861677, + "language_loss": 0.79673374, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.82473409, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.718271017074585 + }, + { + "auxiliary_loss_clip": 0.01498905, + "auxiliary_loss_mlp": 0.01297613, + "balance_loss_clip": 1.15096426, + "balance_loss_mlp": 1.04488945, + "epoch": 0.26610551630843227, + "flos": 24793961421600.0, + "grad_norm": 2.008240730822958, + "language_loss": 0.80739754, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.83536279, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 2.837852954864502 + }, + { + "auxiliary_loss_clip": 0.01508548, + "auxiliary_loss_mlp": 0.01289191, + "balance_loss_clip": 1.16079021, + "balance_loss_mlp": 1.02979124, + "epoch": 0.26616563956110023, + "flos": 27638867335680.0, + "grad_norm": 1.7144767144115929, + "language_loss": 0.77263355, + "learning_rate": 3.442899417008333e-06, + "loss": 0.80061102, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 2.842254161834717 + }, + { + "auxiliary_loss_clip": 0.0150896, + "auxiliary_loss_mlp": 0.01293891, + "balance_loss_clip": 1.16114879, + "balance_loss_mlp": 1.04307461, + "epoch": 0.26622576281376825, + "flos": 28365236814240.0, + "grad_norm": 1.605197498036845, + "language_loss": 0.76964718, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79767573, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.847963571548462 + }, + { + "auxiliary_loss_clip": 0.01493676, + "auxiliary_loss_mlp": 0.01282087, + "balance_loss_clip": 1.14539266, + "balance_loss_mlp": 1.02573967, + "epoch": 0.2662858860664362, + "flos": 18043326321120.0, + "grad_norm": 2.3524233252245206, + "language_loss": 0.82961434, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.85737199, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.803464889526367 + }, + { + "auxiliary_loss_clip": 0.01509614, + "auxiliary_loss_mlp": 0.01295721, + "balance_loss_clip": 1.16134775, + "balance_loss_mlp": 1.03937387, + "epoch": 0.2663460093191042, + "flos": 22747944356640.0, + "grad_norm": 1.6578428942580616, + "language_loss": 0.72193968, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74999303, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 2.808318614959717 + }, + { + "auxiliary_loss_clip": 0.01500885, + "auxiliary_loss_mlp": 0.01292761, + "balance_loss_clip": 1.15280581, + "balance_loss_mlp": 1.0375576, + "epoch": 0.26640613257177215, + "flos": 16510672002240.0, + "grad_norm": 5.0537078476888615, + "language_loss": 0.82229954, + "learning_rate": 3.441820222206035e-06, + "loss": 0.850236, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.870229482650757 + }, + { + "auxiliary_loss_clip": 0.01501252, + "auxiliary_loss_mlp": 0.01303598, + "balance_loss_clip": 1.15313077, + "balance_loss_mlp": 1.04534304, + "epoch": 0.2664662558244401, + "flos": 23078188124640.0, + "grad_norm": 2.4140371064710258, + "language_loss": 0.76428747, + "learning_rate": 3.44155028679496e-06, + "loss": 0.79233599, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.823674201965332 + }, + { + "auxiliary_loss_clip": 0.01502298, + "auxiliary_loss_mlp": 0.0130059, + "balance_loss_clip": 1.15491652, + "balance_loss_mlp": 1.04500544, + "epoch": 0.2665263790771081, + "flos": 23771559739680.0, + "grad_norm": 1.9385494558605731, + "language_loss": 0.82722121, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85525012, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 4.474962949752808 + }, + { + "auxiliary_loss_clip": 0.01499529, + "auxiliary_loss_mlp": 0.01302295, + "balance_loss_clip": 1.15221441, + "balance_loss_mlp": 1.04651928, + "epoch": 0.26658650232977604, + "flos": 28003777806240.0, + "grad_norm": 2.001779212829228, + "language_loss": 0.76704437, + "learning_rate": 3.441010251991854e-06, + "loss": 0.79506254, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 2.8427627086639404 + }, + { + "auxiliary_loss_clip": 0.01499346, + "auxiliary_loss_mlp": 0.01284982, + "balance_loss_clip": 1.15083313, + "balance_loss_mlp": 1.03149533, + "epoch": 0.266646625582444, + "flos": 22165817127840.0, + "grad_norm": 1.8235078555576179, + "language_loss": 0.82612169, + "learning_rate": 3.440740152620301e-06, + "loss": 0.85396498, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 2.790391445159912 + }, + { + "auxiliary_loss_clip": 0.01495533, + "auxiliary_loss_mlp": 0.01293225, + "balance_loss_clip": 1.14745188, + "balance_loss_mlp": 1.03420675, + "epoch": 0.266706748835112, + "flos": 27855932381280.0, + "grad_norm": 2.483462874093252, + "language_loss": 0.87582433, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.90371186, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 2.8250930309295654 + }, + { + "auxiliary_loss_clip": 0.01498025, + "auxiliary_loss_mlp": 0.01299788, + "balance_loss_clip": 1.14994693, + "balance_loss_mlp": 1.04305863, + "epoch": 0.26676687208777994, + "flos": 25814504623680.0, + "grad_norm": 1.4954841775951195, + "language_loss": 0.78654563, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81452376, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 2.8702011108398438 + }, + { + "auxiliary_loss_clip": 0.01500599, + "auxiliary_loss_mlp": 0.01290883, + "balance_loss_clip": 1.15224349, + "balance_loss_mlp": 1.03587079, + "epoch": 0.2668269953404479, + "flos": 36067536849600.0, + "grad_norm": 2.077308088732565, + "language_loss": 0.64119738, + "learning_rate": 3.439929526748556e-06, + "loss": 0.66911227, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 2.9058918952941895 + }, + { + "auxiliary_loss_clip": 0.01494249, + "auxiliary_loss_mlp": 0.01298925, + "balance_loss_clip": 1.14617324, + "balance_loss_mlp": 1.04410362, + "epoch": 0.26688711859311587, + "flos": 26572999618080.0, + "grad_norm": 1.838226907868952, + "language_loss": 0.76345187, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.79138362, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 4.312219858169556 + }, + { + "auxiliary_loss_clip": 0.01498839, + "auxiliary_loss_mlp": 0.01302357, + "balance_loss_clip": 1.14946079, + "balance_loss_mlp": 1.04372072, + "epoch": 0.26694724184578383, + "flos": 26764045581600.0, + "grad_norm": 2.049252688556894, + "language_loss": 0.72082782, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.74883974, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 4.329652786254883 + }, + { + "auxiliary_loss_clip": 0.0149281, + "auxiliary_loss_mlp": 0.01298149, + "balance_loss_clip": 1.14316082, + "balance_loss_mlp": 1.04103851, + "epoch": 0.2670073650984518, + "flos": 20961699809760.0, + "grad_norm": 2.0159346316932227, + "language_loss": 0.66495454, + "learning_rate": 3.439118409456376e-06, + "loss": 0.69286418, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 2.7323758602142334 + }, + { + "auxiliary_loss_clip": 0.01499766, + "auxiliary_loss_mlp": 0.01294563, + "balance_loss_clip": 1.14981389, + "balance_loss_mlp": 1.03592634, + "epoch": 0.2670674883511198, + "flos": 28368574492320.0, + "grad_norm": 1.491115690903782, + "language_loss": 0.76550245, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.79344571, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.8882217407226562 + }, + { + "auxiliary_loss_clip": 0.01650019, + "auxiliary_loss_mlp": 0.01242706, + "balance_loss_clip": 1.30594385, + "balance_loss_mlp": 1.02526855, + "epoch": 0.2671276116037878, + "flos": 58977246265920.0, + "grad_norm": 0.9308989927935226, + "language_loss": 0.6118111, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.64073837, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 3.1665122509002686 + }, + { + "auxiliary_loss_clip": 0.01498774, + "auxiliary_loss_mlp": 0.0129141, + "balance_loss_clip": 1.14956641, + "balance_loss_mlp": 1.03506207, + "epoch": 0.26718773485645575, + "flos": 43948442412000.0, + "grad_norm": 1.800091285005204, + "language_loss": 0.7666598, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.79456162, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 3.034391164779663 + }, + { + "auxiliary_loss_clip": 0.01498085, + "auxiliary_loss_mlp": 0.0128801, + "balance_loss_clip": 1.14838493, + "balance_loss_mlp": 1.03299713, + "epoch": 0.2672478581091237, + "flos": 25230670627680.0, + "grad_norm": 1.8109228162542215, + "language_loss": 0.80457002, + "learning_rate": 3.438036155780158e-06, + "loss": 0.83243096, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.814681053161621 + }, + { + "auxiliary_loss_clip": 0.01496004, + "auxiliary_loss_mlp": 0.01306026, + "balance_loss_clip": 1.14608455, + "balance_loss_mlp": 1.0483433, + "epoch": 0.2673079813617917, + "flos": 15269991573600.0, + "grad_norm": 1.702470381085684, + "language_loss": 0.89200115, + "learning_rate": 3.43776545600926e-06, + "loss": 0.92002147, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.7716312408447266 + }, + { + "auxiliary_loss_clip": 0.01490669, + "auxiliary_loss_mlp": 0.01301102, + "balance_loss_clip": 1.14333737, + "balance_loss_mlp": 1.04895091, + "epoch": 0.26736810461445965, + "flos": 25815149402400.0, + "grad_norm": 1.8318734650142197, + "language_loss": 0.68257523, + "learning_rate": 3.437494701718153e-06, + "loss": 0.71049297, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.9032795429229736 + }, + { + "auxiliary_loss_clip": 0.01500252, + "auxiliary_loss_mlp": 0.01308915, + "balance_loss_clip": 1.15188932, + "balance_loss_mlp": 1.0550468, + "epoch": 0.2674282278671276, + "flos": 24314810240160.0, + "grad_norm": 1.9330150125773915, + "language_loss": 0.83643532, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.86452699, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 2.8017585277557373 + }, + { + "auxiliary_loss_clip": 0.01501154, + "auxiliary_loss_mlp": 0.01315578, + "balance_loss_clip": 1.15352738, + "balance_loss_mlp": 1.06628776, + "epoch": 0.2674883511197956, + "flos": 22817467402560.0, + "grad_norm": 2.1533008096053052, + "language_loss": 0.84565938, + "learning_rate": 3.436953029616378e-06, + "loss": 0.87382674, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.7854597568511963 + }, + { + "auxiliary_loss_clip": 0.01503432, + "auxiliary_loss_mlp": 0.01318562, + "balance_loss_clip": 1.1570164, + "balance_loss_mlp": 1.0606885, + "epoch": 0.26754847437246354, + "flos": 25372295834400.0, + "grad_norm": 1.6546993807640091, + "language_loss": 0.83704662, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86526656, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.88189697265625 + }, + { + "auxiliary_loss_clip": 0.01508611, + "auxiliary_loss_mlp": 0.01294623, + "balance_loss_clip": 1.16090536, + "balance_loss_mlp": 1.04609537, + "epoch": 0.2676085976251315, + "flos": 20232523647360.0, + "grad_norm": 2.0458068555415916, + "language_loss": 0.81024778, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83828014, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.792360544204712 + }, + { + "auxiliary_loss_clip": 0.01519261, + "auxiliary_loss_mlp": 0.01315406, + "balance_loss_clip": 1.17090702, + "balance_loss_mlp": 1.06535268, + "epoch": 0.26766872087779947, + "flos": 28040758126560.0, + "grad_norm": 1.7835300847374507, + "language_loss": 0.86793834, + "learning_rate": 3.436140112818882e-06, + "loss": 0.89628494, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.857982873916626 + }, + { + "auxiliary_loss_clip": 0.01510786, + "auxiliary_loss_mlp": 0.01311753, + "balance_loss_clip": 1.16366363, + "balance_loss_mlp": 1.05883825, + "epoch": 0.26772884413046744, + "flos": 18326728447200.0, + "grad_norm": 2.989267815926672, + "language_loss": 0.83722234, + "learning_rate": 3.435869031622194e-06, + "loss": 0.8654477, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.7208940982818604 + }, + { + "auxiliary_loss_clip": 0.01509932, + "auxiliary_loss_mlp": 0.01302837, + "balance_loss_clip": 1.16290104, + "balance_loss_mlp": 1.04973185, + "epoch": 0.2677889673831354, + "flos": 22129709155200.0, + "grad_norm": 1.6241470214436935, + "language_loss": 0.79487228, + "learning_rate": 3.435597895977208e-06, + "loss": 0.82299995, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.81257700920105 + }, + { + "auxiliary_loss_clip": 0.0150423, + "auxiliary_loss_mlp": 0.01310687, + "balance_loss_clip": 1.1565311, + "balance_loss_mlp": 1.05452991, + "epoch": 0.2678490906358034, + "flos": 23731810663680.0, + "grad_norm": 1.5570092638230784, + "language_loss": 0.7273972, + "learning_rate": 3.435326705894206e-06, + "loss": 0.75554639, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.790692090988159 + }, + { + "auxiliary_loss_clip": 0.01514072, + "auxiliary_loss_mlp": 0.01288186, + "balance_loss_clip": 1.16654587, + "balance_loss_mlp": 1.03565335, + "epoch": 0.2679092138884714, + "flos": 21765064181760.0, + "grad_norm": 1.8658716253316707, + "language_loss": 0.73939455, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76741707, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.8308091163635254 + }, + { + "auxiliary_loss_clip": 0.01507468, + "auxiliary_loss_mlp": 0.0128773, + "balance_loss_clip": 1.15967011, + "balance_loss_mlp": 1.03004718, + "epoch": 0.26796933714113935, + "flos": 19863023869440.0, + "grad_norm": 2.1502361947016184, + "language_loss": 0.70959383, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73754585, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.7572667598724365 + }, + { + "auxiliary_loss_clip": 0.01519336, + "auxiliary_loss_mlp": 0.01318239, + "balance_loss_clip": 1.17086208, + "balance_loss_mlp": 1.0637989, + "epoch": 0.2680294603938073, + "flos": 20049973591680.0, + "grad_norm": 1.681392422846907, + "language_loss": 0.79031026, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81868601, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 2.8744704723358154 + }, + { + "auxiliary_loss_clip": 0.01652537, + "auxiliary_loss_mlp": 0.01283623, + "balance_loss_clip": 1.31043661, + "balance_loss_mlp": 1.06999969, + "epoch": 0.2680895836464753, + "flos": 72120850274880.0, + "grad_norm": 0.9646341623728424, + "language_loss": 0.58711636, + "learning_rate": 3.434241401387739e-06, + "loss": 0.61647797, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.3407723903656006 + }, + { + "auxiliary_loss_clip": 0.01512307, + "auxiliary_loss_mlp": 0.01291375, + "balance_loss_clip": 1.16382599, + "balance_loss_mlp": 1.03559995, + "epoch": 0.26814970689914325, + "flos": 20451143747520.0, + "grad_norm": 2.171524727752142, + "language_loss": 0.85772479, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.88576162, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.822899580001831 + }, + { + "auxiliary_loss_clip": 0.01508246, + "auxiliary_loss_mlp": 0.01290618, + "balance_loss_clip": 1.16061485, + "balance_loss_mlp": 1.03007388, + "epoch": 0.2682098301518112, + "flos": 17568612734400.0, + "grad_norm": 2.0947057169142687, + "language_loss": 0.68140167, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70939034, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.834918737411499 + }, + { + "auxiliary_loss_clip": 0.01503923, + "auxiliary_loss_mlp": 0.01283994, + "balance_loss_clip": 1.15511274, + "balance_loss_mlp": 1.02268755, + "epoch": 0.2682699534044792, + "flos": 18335262283200.0, + "grad_norm": 2.247381391716156, + "language_loss": 0.672786, + "learning_rate": 3.43342685191282e-06, + "loss": 0.70066524, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.904170036315918 + }, + { + "auxiliary_loss_clip": 0.01509926, + "auxiliary_loss_mlp": 0.0129419, + "balance_loss_clip": 1.16105819, + "balance_loss_mlp": 1.03898668, + "epoch": 0.26833007665714714, + "flos": 25303645136160.0, + "grad_norm": 1.7537395655405905, + "language_loss": 0.69750738, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.72554851, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.9010229110717773 + }, + { + "auxiliary_loss_clip": 0.01505957, + "auxiliary_loss_mlp": 0.01285657, + "balance_loss_clip": 1.1573478, + "balance_loss_mlp": 1.02835536, + "epoch": 0.2683901999098151, + "flos": 16101195579360.0, + "grad_norm": 3.591247619464578, + "language_loss": 0.78725415, + "learning_rate": 3.432883547133931e-06, + "loss": 0.81517029, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.805690288543701 + }, + { + "auxiliary_loss_clip": 0.01510398, + "auxiliary_loss_mlp": 0.01305685, + "balance_loss_clip": 1.1620295, + "balance_loss_mlp": 1.04571307, + "epoch": 0.2684503231624831, + "flos": 27310519975680.0, + "grad_norm": 1.6716251901623185, + "language_loss": 0.7120856, + "learning_rate": 3.432611813236704e-06, + "loss": 0.74024642, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.906498908996582 + }, + { + "auxiliary_loss_clip": 0.01631548, + "auxiliary_loss_mlp": 0.01247116, + "balance_loss_clip": 1.28935504, + "balance_loss_mlp": 1.02510071, + "epoch": 0.26851044641515104, + "flos": 71865060213600.0, + "grad_norm": 0.6816278445695728, + "language_loss": 0.53040278, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55918944, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.4586312770843506 + }, + { + "auxiliary_loss_clip": 0.0150524, + "auxiliary_loss_mlp": 0.01285302, + "balance_loss_clip": 1.15762091, + "balance_loss_mlp": 1.03009844, + "epoch": 0.268570569667819, + "flos": 18735749732160.0, + "grad_norm": 2.222770300811422, + "language_loss": 0.74160361, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76950896, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 2.7777810096740723 + }, + { + "auxiliary_loss_clip": 0.0151025, + "auxiliary_loss_mlp": 0.01295236, + "balance_loss_clip": 1.16220284, + "balance_loss_mlp": 1.03926992, + "epoch": 0.268630692920487, + "flos": 18179869154400.0, + "grad_norm": 2.929176311537472, + "language_loss": 0.8137536, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.8418085, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.8105616569519043 + }, + { + "auxiliary_loss_clip": 0.016201, + "auxiliary_loss_mlp": 0.01253761, + "balance_loss_clip": 1.27934551, + "balance_loss_mlp": 1.03632355, + "epoch": 0.268690816173155, + "flos": 68739293360160.0, + "grad_norm": 1.0298006613927535, + "language_loss": 0.59485942, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.62359804, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.3647069931030273 + }, + { + "auxiliary_loss_clip": 0.01506616, + "auxiliary_loss_mlp": 0.01294976, + "balance_loss_clip": 1.15989733, + "balance_loss_mlp": 1.04072618, + "epoch": 0.26875093942582295, + "flos": 23295822092640.0, + "grad_norm": 2.4140762301381544, + "language_loss": 0.82129508, + "learning_rate": 3.431252329084972e-06, + "loss": 0.84931099, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 4.398313522338867 + }, + { + "auxiliary_loss_clip": 0.01499729, + "auxiliary_loss_mlp": 0.01303703, + "balance_loss_clip": 1.15173054, + "balance_loss_mlp": 1.05136108, + "epoch": 0.2688110626784909, + "flos": 21545609662080.0, + "grad_norm": 1.6306124273620566, + "language_loss": 0.83113956, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.85917383, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 2.798953056335449 + }, + { + "auxiliary_loss_clip": 0.01513117, + "auxiliary_loss_mlp": 0.01315398, + "balance_loss_clip": 1.16366911, + "balance_loss_mlp": 1.06362784, + "epoch": 0.2688711859311589, + "flos": 28403013625920.0, + "grad_norm": 2.2638559649971395, + "language_loss": 0.69161129, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71989644, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 2.895556688308716 + }, + { + "auxiliary_loss_clip": 0.0150315, + "auxiliary_loss_mlp": 0.01296769, + "balance_loss_clip": 1.15549374, + "balance_loss_mlp": 1.04786038, + "epoch": 0.26893130918382685, + "flos": 25997358104640.0, + "grad_norm": 1.7194353552733186, + "language_loss": 0.67826772, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70626688, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 2.8334460258483887 + }, + { + "auxiliary_loss_clip": 0.01500515, + "auxiliary_loss_mlp": 0.01298798, + "balance_loss_clip": 1.15215421, + "balance_loss_mlp": 1.04950786, + "epoch": 0.2689914324364948, + "flos": 20341719912960.0, + "grad_norm": 2.0843460873937887, + "language_loss": 0.83248621, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.86047935, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 2.7971041202545166 + }, + { + "auxiliary_loss_clip": 0.01501036, + "auxiliary_loss_mlp": 0.01312192, + "balance_loss_clip": 1.15298152, + "balance_loss_mlp": 1.06252015, + "epoch": 0.2690515556891628, + "flos": 19466632661760.0, + "grad_norm": 1.8139713983083123, + "language_loss": 0.70906746, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.73719978, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 2.8524208068847656 + }, + { + "auxiliary_loss_clip": 0.01495727, + "auxiliary_loss_mlp": 0.01295272, + "balance_loss_clip": 1.14840257, + "balance_loss_mlp": 1.03778005, + "epoch": 0.26911167894183075, + "flos": 18148388417280.0, + "grad_norm": 1.6632616601408456, + "language_loss": 0.73535132, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.76326132, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.7539243698120117 + }, + { + "auxiliary_loss_clip": 0.0149656, + "auxiliary_loss_mlp": 0.01296092, + "balance_loss_clip": 1.14868712, + "balance_loss_mlp": 1.04260528, + "epoch": 0.2691718021944987, + "flos": 19977226652160.0, + "grad_norm": 1.6677814364208599, + "language_loss": 0.80885917, + "learning_rate": 3.429346772085922e-06, + "loss": 0.83678567, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 4.324236869812012 + }, + { + "auxiliary_loss_clip": 0.01493821, + "auxiliary_loss_mlp": 0.01291233, + "balance_loss_clip": 1.14679492, + "balance_loss_mlp": 1.03698313, + "epoch": 0.2692319254471667, + "flos": 37450032125760.0, + "grad_norm": 1.7515027286334268, + "language_loss": 0.65501338, + "learning_rate": 3.429074332770984e-06, + "loss": 0.68286383, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 4.398770332336426 + }, + { + "auxiliary_loss_clip": 0.01497953, + "auxiliary_loss_mlp": 0.01286473, + "balance_loss_clip": 1.1509006, + "balance_loss_mlp": 1.03146052, + "epoch": 0.26929204869983464, + "flos": 22130012580480.0, + "grad_norm": 1.8647084294118315, + "language_loss": 0.80873293, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83657718, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 4.299362659454346 + }, + { + "auxiliary_loss_clip": 0.01499877, + "auxiliary_loss_mlp": 0.01293912, + "balance_loss_clip": 1.15249467, + "balance_loss_mlp": 1.03718305, + "epoch": 0.2693521719525026, + "flos": 19794904165440.0, + "grad_norm": 2.6524370361969676, + "language_loss": 0.8106302, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83856809, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 2.7863073348999023 + }, + { + "auxiliary_loss_clip": 0.0149649, + "auxiliary_loss_mlp": 0.01285496, + "balance_loss_clip": 1.14916873, + "balance_loss_mlp": 1.03506136, + "epoch": 0.2694122952051706, + "flos": 20996404440480.0, + "grad_norm": 1.5859045569903831, + "language_loss": 0.77594614, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80376601, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 2.8162360191345215 + }, + { + "auxiliary_loss_clip": 0.01501188, + "auxiliary_loss_mlp": 0.01295631, + "balance_loss_clip": 1.15195942, + "balance_loss_mlp": 1.03928375, + "epoch": 0.2694724184578386, + "flos": 25851712512960.0, + "grad_norm": 1.5989318882646097, + "language_loss": 0.74229169, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.77025986, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 2.92925763130188 + }, + { + "auxiliary_loss_clip": 0.01493421, + "auxiliary_loss_mlp": 0.01292574, + "balance_loss_clip": 1.14631224, + "balance_loss_mlp": 1.03260279, + "epoch": 0.26953254171050656, + "flos": 21729183778080.0, + "grad_norm": 2.243866828983813, + "language_loss": 0.72742474, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.75528467, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 2.863272190093994 + }, + { + "auxiliary_loss_clip": 0.01495357, + "auxiliary_loss_mlp": 0.01293211, + "balance_loss_clip": 1.14650869, + "balance_loss_mlp": 1.03419304, + "epoch": 0.2695926649631745, + "flos": 19684987264800.0, + "grad_norm": 2.2562356446071505, + "language_loss": 0.87065017, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89853585, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.728945732116699 + }, + { + "auxiliary_loss_clip": 0.01490589, + "auxiliary_loss_mlp": 0.01295077, + "balance_loss_clip": 1.1431427, + "balance_loss_mlp": 1.03396094, + "epoch": 0.2696527882158425, + "flos": 32889163273920.0, + "grad_norm": 1.586328390357918, + "language_loss": 0.66559625, + "learning_rate": 3.427165740807239e-06, + "loss": 0.69345289, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.9770052433013916 + }, + { + "auxiliary_loss_clip": 0.01495186, + "auxiliary_loss_mlp": 0.0128886, + "balance_loss_clip": 1.14698207, + "balance_loss_mlp": 1.0288887, + "epoch": 0.26971291146851045, + "flos": 12126170916000.0, + "grad_norm": 2.850405442922978, + "language_loss": 0.72418582, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75202632, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.743942975997925 + }, + { + "auxiliary_loss_clip": 0.0149584, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 1.14852118, + "balance_loss_mlp": 1.02651858, + "epoch": 0.2697730347211784, + "flos": 22636472401440.0, + "grad_norm": 2.1321266595462167, + "language_loss": 0.84424096, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.87207377, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 2.781039237976074 + }, + { + "auxiliary_loss_clip": 0.01506994, + "auxiliary_loss_mlp": 0.01310691, + "balance_loss_clip": 1.1578964, + "balance_loss_mlp": 1.06140065, + "epoch": 0.2698331579738464, + "flos": 23515276612320.0, + "grad_norm": 2.315127675677367, + "language_loss": 0.72212195, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.75029886, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.8202362060546875 + }, + { + "auxiliary_loss_clip": 0.0149588, + "auxiliary_loss_mlp": 0.01306376, + "balance_loss_clip": 1.14528155, + "balance_loss_mlp": 1.05384302, + "epoch": 0.26989328122651435, + "flos": 24643157600160.0, + "grad_norm": 1.684675864867829, + "language_loss": 0.83878434, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86680686, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.7584495544433594 + }, + { + "auxiliary_loss_clip": 0.01493682, + "auxiliary_loss_mlp": 0.01295155, + "balance_loss_clip": 1.14533186, + "balance_loss_mlp": 1.03747177, + "epoch": 0.2699534044791823, + "flos": 10773677178720.0, + "grad_norm": 2.2353394933546893, + "language_loss": 0.90079403, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92868245, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 2.742324113845825 + }, + { + "auxiliary_loss_clip": 0.01495425, + "auxiliary_loss_mlp": 0.01293288, + "balance_loss_clip": 1.14649498, + "balance_loss_mlp": 1.04380655, + "epoch": 0.2700135277318503, + "flos": 36174419497440.0, + "grad_norm": 1.8111110574697098, + "language_loss": 0.73112518, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75901228, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.8923282623291016 + }, + { + "auxiliary_loss_clip": 0.01502941, + "auxiliary_loss_mlp": 0.01300429, + "balance_loss_clip": 1.15421128, + "balance_loss_mlp": 1.0416019, + "epoch": 0.27007365098451824, + "flos": 17420425956000.0, + "grad_norm": 3.205425566534321, + "language_loss": 0.74598473, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.77401841, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.7458152770996094 + }, + { + "auxiliary_loss_clip": 0.01493917, + "auxiliary_loss_mlp": 0.01304571, + "balance_loss_clip": 1.14524722, + "balance_loss_mlp": 1.04765129, + "epoch": 0.2701337742371862, + "flos": 23187953312640.0, + "grad_norm": 1.7923546035526672, + "language_loss": 0.89175326, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91973817, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.881284713745117 + }, + { + "auxiliary_loss_clip": 0.0150389, + "auxiliary_loss_mlp": 0.01300089, + "balance_loss_clip": 1.15604651, + "balance_loss_mlp": 1.0494628, + "epoch": 0.2701938974898542, + "flos": 24391767205440.0, + "grad_norm": 1.4905228338145085, + "language_loss": 0.7106654, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73870516, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.7958266735076904 + }, + { + "auxiliary_loss_clip": 0.01491358, + "auxiliary_loss_mlp": 0.01316707, + "balance_loss_clip": 1.14198935, + "balance_loss_mlp": 1.06627238, + "epoch": 0.2702540207425222, + "flos": 26216774696160.0, + "grad_norm": 2.0279491384394692, + "language_loss": 0.86408961, + "learning_rate": 3.42443458168683e-06, + "loss": 0.89217031, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.817406415939331 + }, + { + "auxiliary_loss_clip": 0.01499016, + "auxiliary_loss_mlp": 0.01302124, + "balance_loss_clip": 1.1501087, + "balance_loss_mlp": 1.04940081, + "epoch": 0.27031414399519016, + "flos": 22928408363520.0, + "grad_norm": 2.6562038379172908, + "language_loss": 0.76963973, + "learning_rate": 3.424161168522959e-06, + "loss": 0.79765117, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.7901768684387207 + }, + { + "auxiliary_loss_clip": 0.01591805, + "auxiliary_loss_mlp": 0.01307533, + "balance_loss_clip": 1.24710083, + "balance_loss_mlp": 1.09467316, + "epoch": 0.2703742672478581, + "flos": 63025852288320.0, + "grad_norm": 0.7235251238039077, + "language_loss": 0.50150394, + "learning_rate": 3.423887701354754e-06, + "loss": 0.53049737, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.379340410232544 + }, + { + "auxiliary_loss_clip": 0.01498264, + "auxiliary_loss_mlp": 0.01303112, + "balance_loss_clip": 1.14904702, + "balance_loss_mlp": 1.04905283, + "epoch": 0.2704343905005261, + "flos": 18842670308160.0, + "grad_norm": 1.704214690222798, + "language_loss": 0.72298229, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.75099599, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 2.746023178100586 + }, + { + "auxiliary_loss_clip": 0.01592158, + "auxiliary_loss_mlp": 0.0127021, + "balance_loss_clip": 1.24659252, + "balance_loss_mlp": 1.05658722, + "epoch": 0.27049451375319405, + "flos": 71240187584160.0, + "grad_norm": 0.7445421133443284, + "language_loss": 0.59180659, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.62043029, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.302929401397705 + }, + { + "auxiliary_loss_clip": 0.01495787, + "auxiliary_loss_mlp": 0.01285112, + "balance_loss_clip": 1.14564586, + "balance_loss_mlp": 1.02952731, + "epoch": 0.270554637005862, + "flos": 24280826244480.0, + "grad_norm": 2.7778238581175105, + "language_loss": 0.73712277, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.7649318, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.7578775882720947 + }, + { + "auxiliary_loss_clip": 0.01486163, + "auxiliary_loss_mlp": 0.01288506, + "balance_loss_clip": 1.13697505, + "balance_loss_mlp": 1.03425598, + "epoch": 0.27061476025853, + "flos": 17633849898240.0, + "grad_norm": 2.904869694173102, + "language_loss": 0.81212234, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.83986902, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.77134108543396 + }, + { + "auxiliary_loss_clip": 0.01499417, + "auxiliary_loss_mlp": 0.0129321, + "balance_loss_clip": 1.15001893, + "balance_loss_mlp": 1.03590846, + "epoch": 0.27067488351119795, + "flos": 22712291521920.0, + "grad_norm": 2.132033871461253, + "language_loss": 0.72599804, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75392431, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.8727328777313232 + }, + { + "auxiliary_loss_clip": 0.01489767, + "auxiliary_loss_mlp": 0.01311384, + "balance_loss_clip": 1.14008927, + "balance_loss_mlp": 1.05122185, + "epoch": 0.2707350067638659, + "flos": 41722871616000.0, + "grad_norm": 2.0722998787426676, + "language_loss": 0.69017363, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.71818513, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.9052441120147705 + }, + { + "auxiliary_loss_clip": 0.01490029, + "auxiliary_loss_mlp": 0.01284266, + "balance_loss_clip": 1.13966894, + "balance_loss_mlp": 1.02582014, + "epoch": 0.2707951300165339, + "flos": 20195088189120.0, + "grad_norm": 1.8737258920483408, + "language_loss": 0.67926586, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70700878, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 2.813694715499878 + }, + { + "auxiliary_loss_clip": 0.0149934, + "auxiliary_loss_mlp": 0.01294766, + "balance_loss_clip": 1.14952528, + "balance_loss_mlp": 1.04204178, + "epoch": 0.27085525326920185, + "flos": 21436109971200.0, + "grad_norm": 1.5571964555132392, + "language_loss": 0.75781053, + "learning_rate": 3.421698021097902e-06, + "loss": 0.78575158, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.7641284465789795 + }, + { + "auxiliary_loss_clip": 0.01496353, + "auxiliary_loss_mlp": 0.01305744, + "balance_loss_clip": 1.14559054, + "balance_loss_mlp": 1.04748881, + "epoch": 0.2709153765218698, + "flos": 17677126293120.0, + "grad_norm": 2.6167862242356934, + "language_loss": 0.74058896, + "learning_rate": 3.42142406835758e-06, + "loss": 0.76860988, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.787550687789917 + }, + { + "auxiliary_loss_clip": 0.01495043, + "auxiliary_loss_mlp": 0.01290036, + "balance_loss_clip": 1.14505386, + "balance_loss_mlp": 1.03082776, + "epoch": 0.2709754997745378, + "flos": 24458218070400.0, + "grad_norm": 2.075059074166087, + "language_loss": 0.812374, + "learning_rate": 3.421150061716715e-06, + "loss": 0.84022486, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.7862067222595215 + }, + { + "auxiliary_loss_clip": 0.01591016, + "auxiliary_loss_mlp": 0.01247887, + "balance_loss_clip": 1.24522448, + "balance_loss_mlp": 1.02510834, + "epoch": 0.2710356230272058, + "flos": 65217097735200.0, + "grad_norm": 0.7498387165104429, + "language_loss": 0.50790727, + "learning_rate": 3.420876001185698e-06, + "loss": 0.53629631, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 4.968835115432739 + }, + { + "auxiliary_loss_clip": 0.01487674, + "auxiliary_loss_mlp": 0.01283585, + "balance_loss_clip": 1.13740826, + "balance_loss_mlp": 1.02666545, + "epoch": 0.27109574627987376, + "flos": 25486839970560.0, + "grad_norm": 2.451973974116792, + "language_loss": 0.74936712, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.77707976, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.86383318901062 + }, + { + "auxiliary_loss_clip": 0.01485669, + "auxiliary_loss_mlp": 0.01287644, + "balance_loss_clip": 1.13594079, + "balance_loss_mlp": 1.03740001, + "epoch": 0.2711558695325417, + "flos": 19684835552160.0, + "grad_norm": 10.188241740321835, + "language_loss": 0.71686745, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.74460065, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.863677501678467 + }, + { + "auxiliary_loss_clip": 0.01498486, + "auxiliary_loss_mlp": 0.01297126, + "balance_loss_clip": 1.14789283, + "balance_loss_mlp": 1.04497457, + "epoch": 0.2712159927852097, + "flos": 18589724858880.0, + "grad_norm": 2.698244439062043, + "language_loss": 0.7053591, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.73331523, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 2.7701103687286377 + }, + { + "auxiliary_loss_clip": 0.01490574, + "auxiliary_loss_mlp": 0.01303256, + "balance_loss_clip": 1.14135456, + "balance_loss_mlp": 1.04805219, + "epoch": 0.27127611603787766, + "flos": 25632637274880.0, + "grad_norm": 2.0723176469601228, + "language_loss": 0.81129432, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83923268, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 2.7738215923309326 + }, + { + "auxiliary_loss_clip": 0.01487387, + "auxiliary_loss_mlp": 0.01291661, + "balance_loss_clip": 1.13752937, + "balance_loss_mlp": 1.03893745, + "epoch": 0.2713362392905456, + "flos": 23151200561280.0, + "grad_norm": 1.748867154171048, + "language_loss": 0.80598152, + "learning_rate": 3.419504890542124e-06, + "loss": 0.83377206, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 2.7403743267059326 + }, + { + "auxiliary_loss_clip": 0.01486069, + "auxiliary_loss_mlp": 0.01292275, + "balance_loss_clip": 1.13503122, + "balance_loss_mlp": 1.04145896, + "epoch": 0.2713963625432136, + "flos": 18367842936960.0, + "grad_norm": 1.9808033405660352, + "language_loss": 0.88310945, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.9108929, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 2.772639513015747 + }, + { + "auxiliary_loss_clip": 0.01491658, + "auxiliary_loss_mlp": 0.01303057, + "balance_loss_clip": 1.14085698, + "balance_loss_mlp": 1.04899859, + "epoch": 0.27145648579588155, + "flos": 22493936918880.0, + "grad_norm": 2.019126995767401, + "language_loss": 0.91645336, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94440049, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 4.337966442108154 + }, + { + "auxiliary_loss_clip": 0.01502468, + "auxiliary_loss_mlp": 0.01310003, + "balance_loss_clip": 1.15013218, + "balance_loss_mlp": 1.05708885, + "epoch": 0.2715166090485495, + "flos": 19240882067520.0, + "grad_norm": 2.8765295091907848, + "language_loss": 0.73826247, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.76638722, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 4.201813697814941 + }, + { + "auxiliary_loss_clip": 0.01487579, + "auxiliary_loss_mlp": 0.01310054, + "balance_loss_clip": 1.13724732, + "balance_loss_mlp": 1.05732989, + "epoch": 0.2715767323012175, + "flos": 17711375785920.0, + "grad_norm": 2.931187187982907, + "language_loss": 0.76332164, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.79129803, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 4.447997331619263 + }, + { + "auxiliary_loss_clip": 0.01496116, + "auxiliary_loss_mlp": 0.01316541, + "balance_loss_clip": 1.14438629, + "balance_loss_mlp": 1.06629682, + "epoch": 0.27163685555388545, + "flos": 22385044078560.0, + "grad_norm": 2.4535610004860384, + "language_loss": 0.7697804, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79790699, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 2.75199294090271 + }, + { + "auxiliary_loss_clip": 0.0148737, + "auxiliary_loss_mlp": 0.01295414, + "balance_loss_clip": 1.13590097, + "balance_loss_mlp": 1.04364431, + "epoch": 0.2716969788065534, + "flos": 22348936105920.0, + "grad_norm": 2.4821219477446803, + "language_loss": 0.68444157, + "learning_rate": 3.41785778156811e-06, + "loss": 0.71226937, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.7549021244049072 + }, + { + "auxiliary_loss_clip": 0.01490296, + "auxiliary_loss_mlp": 0.01297057, + "balance_loss_clip": 1.13982809, + "balance_loss_mlp": 1.04795766, + "epoch": 0.2717571020592214, + "flos": 25230822340320.0, + "grad_norm": 2.063938658677633, + "language_loss": 0.76217097, + "learning_rate": 3.417583075166451e-06, + "loss": 0.79004449, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 2.7970683574676514 + }, + { + "auxiliary_loss_clip": 0.01500268, + "auxiliary_loss_mlp": 0.01304316, + "balance_loss_clip": 1.14994013, + "balance_loss_mlp": 1.05102038, + "epoch": 0.2718172253118894, + "flos": 20191636726560.0, + "grad_norm": 2.4773165204616268, + "language_loss": 0.77141011, + "learning_rate": 3.4173083150099e-06, + "loss": 0.799456, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.8062565326690674 + }, + { + "auxiliary_loss_clip": 0.01494587, + "auxiliary_loss_mlp": 0.01319764, + "balance_loss_clip": 1.14280593, + "balance_loss_mlp": 1.06570506, + "epoch": 0.27187734856455736, + "flos": 14320867825440.0, + "grad_norm": 2.169412827320006, + "language_loss": 0.75599617, + "learning_rate": 3.417033501108875e-06, + "loss": 0.78413963, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 2.769573450088501 + }, + { + "auxiliary_loss_clip": 0.01492884, + "auxiliary_loss_mlp": 0.01291536, + "balance_loss_clip": 1.14215565, + "balance_loss_mlp": 1.03366256, + "epoch": 0.27193747181722533, + "flos": 21109924516320.0, + "grad_norm": 1.8768271743435352, + "language_loss": 0.72854137, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75638556, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.7630534172058105 + }, + { + "auxiliary_loss_clip": 0.01492503, + "auxiliary_loss_mlp": 0.01282294, + "balance_loss_clip": 1.1418047, + "balance_loss_mlp": 1.0278542, + "epoch": 0.2719975950698933, + "flos": 19684835552160.0, + "grad_norm": 1.608102067791249, + "language_loss": 0.74737662, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.77512467, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.7856576442718506 + }, + { + "auxiliary_loss_clip": 0.01492909, + "auxiliary_loss_mlp": 0.01290198, + "balance_loss_clip": 1.1425643, + "balance_loss_mlp": 1.02965438, + "epoch": 0.27205771832256126, + "flos": 24756829388640.0, + "grad_norm": 2.140018935314756, + "language_loss": 0.76176465, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78959566, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.8272621631622314 + }, + { + "auxiliary_loss_clip": 0.01491334, + "auxiliary_loss_mlp": 0.0129214, + "balance_loss_clip": 1.14121246, + "balance_loss_mlp": 1.03960693, + "epoch": 0.2721178415752292, + "flos": 21757364765280.0, + "grad_norm": 2.1223834603422462, + "language_loss": 0.82427299, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.8521077, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.749872922897339 + }, + { + "auxiliary_loss_clip": 0.01495614, + "auxiliary_loss_mlp": 0.01295558, + "balance_loss_clip": 1.14408302, + "balance_loss_mlp": 1.03768468, + "epoch": 0.2721779648278972, + "flos": 12678448318560.0, + "grad_norm": 3.263868224867014, + "language_loss": 0.76982248, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79773414, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 2.7312302589416504 + }, + { + "auxiliary_loss_clip": 0.01497196, + "auxiliary_loss_mlp": 0.01289085, + "balance_loss_clip": 1.14605999, + "balance_loss_mlp": 1.03025818, + "epoch": 0.27223808808056515, + "flos": 16255071581760.0, + "grad_norm": 2.224962432674028, + "language_loss": 0.81793731, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84580016, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.697845458984375 + }, + { + "auxiliary_loss_clip": 0.01501445, + "auxiliary_loss_mlp": 0.01294377, + "balance_loss_clip": 1.15029109, + "balance_loss_mlp": 1.04260719, + "epoch": 0.2722982113332331, + "flos": 27748291170240.0, + "grad_norm": 2.4147349867101515, + "language_loss": 0.77515233, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.80311054, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 2.8228366374969482 + }, + { + "auxiliary_loss_clip": 0.01497632, + "auxiliary_loss_mlp": 0.01287511, + "balance_loss_clip": 1.14651155, + "balance_loss_mlp": 1.02944684, + "epoch": 0.2723583345859011, + "flos": 21728425214880.0, + "grad_norm": 1.9048194819159687, + "language_loss": 0.82252532, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.85037673, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.8007242679595947 + }, + { + "auxiliary_loss_clip": 0.01494349, + "auxiliary_loss_mlp": 0.01290387, + "balance_loss_clip": 1.14258313, + "balance_loss_mlp": 1.03327644, + "epoch": 0.27241845783856905, + "flos": 17349158214720.0, + "grad_norm": 3.4898913623616674, + "language_loss": 0.91735357, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.94520092, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.714459180831909 + }, + { + "auxiliary_loss_clip": 0.01494332, + "auxiliary_loss_mlp": 0.01295893, + "balance_loss_clip": 1.14372909, + "balance_loss_mlp": 1.03306055, + "epoch": 0.272478581091237, + "flos": 24756639747840.0, + "grad_norm": 2.232447807998471, + "language_loss": 0.76292813, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.79083037, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.816671371459961 + }, + { + "auxiliary_loss_clip": 0.0149733, + "auxiliary_loss_mlp": 0.01284488, + "balance_loss_clip": 1.14550376, + "balance_loss_mlp": 1.02928495, + "epoch": 0.272538704343905, + "flos": 17892332858880.0, + "grad_norm": 2.965855917218391, + "language_loss": 0.89317822, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.92099637, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.6973507404327393 + }, + { + "auxiliary_loss_clip": 0.01506532, + "auxiliary_loss_mlp": 0.01297273, + "balance_loss_clip": 1.15491605, + "balance_loss_mlp": 1.04473948, + "epoch": 0.272598827596573, + "flos": 22934363084640.0, + "grad_norm": 2.79744956314649, + "language_loss": 0.7199229, + "learning_rate": 3.413731546022929e-06, + "loss": 0.74796093, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.8483736515045166 + }, + { + "auxiliary_loss_clip": 0.01497122, + "auxiliary_loss_mlp": 0.01295672, + "balance_loss_clip": 1.14663076, + "balance_loss_mlp": 1.03665423, + "epoch": 0.27265895084924097, + "flos": 24240432389760.0, + "grad_norm": 1.702795072643634, + "language_loss": 0.9129557, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.94088364, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.7908008098602295 + }, + { + "auxiliary_loss_clip": 0.01512233, + "auxiliary_loss_mlp": 0.01298632, + "balance_loss_clip": 1.16241717, + "balance_loss_mlp": 1.03885126, + "epoch": 0.27271907410190893, + "flos": 27015435976320.0, + "grad_norm": 1.6557013830156049, + "language_loss": 0.72889704, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75700569, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.920088768005371 + }, + { + "auxiliary_loss_clip": 0.01507157, + "auxiliary_loss_mlp": 0.01296523, + "balance_loss_clip": 1.15551424, + "balance_loss_mlp": 1.03807724, + "epoch": 0.2727791973545769, + "flos": 34455080953440.0, + "grad_norm": 1.750980460394382, + "language_loss": 0.71486628, + "learning_rate": 3.41290485034781e-06, + "loss": 0.74290305, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 2.9283103942871094 + }, + { + "auxiliary_loss_clip": 0.01501724, + "auxiliary_loss_mlp": 0.01299631, + "balance_loss_clip": 1.1506803, + "balance_loss_mlp": 1.03641701, + "epoch": 0.27283932060724486, + "flos": 15043103134560.0, + "grad_norm": 4.379377623388029, + "language_loss": 0.783876, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.81188965, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.6903207302093506 + }, + { + "auxiliary_loss_clip": 0.01500482, + "auxiliary_loss_mlp": 0.01302822, + "balance_loss_clip": 1.14970064, + "balance_loss_mlp": 1.04799998, + "epoch": 0.2728994438599128, + "flos": 21654540430560.0, + "grad_norm": 1.5257307022902589, + "language_loss": 0.90274352, + "learning_rate": 3.412353451992847e-06, + "loss": 0.93077654, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.8868215084075928 + }, + { + "auxiliary_loss_clip": 0.01503712, + "auxiliary_loss_mlp": 0.01293189, + "balance_loss_clip": 1.15241766, + "balance_loss_mlp": 1.03455198, + "epoch": 0.2729595671125808, + "flos": 17490062786400.0, + "grad_norm": 1.9342963166078755, + "language_loss": 0.88261724, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.91058618, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.7740228176116943 + }, + { + "auxiliary_loss_clip": 0.01503384, + "auxiliary_loss_mlp": 0.01302161, + "balance_loss_clip": 1.15256834, + "balance_loss_mlp": 1.04657602, + "epoch": 0.27301969036524876, + "flos": 19320190578720.0, + "grad_norm": 2.890507289988626, + "language_loss": 0.82074577, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.8488012, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.855978488922119 + }, + { + "auxiliary_loss_clip": 0.01496982, + "auxiliary_loss_mlp": 0.0129187, + "balance_loss_clip": 1.14650524, + "balance_loss_mlp": 1.0401001, + "epoch": 0.2730798136179167, + "flos": 21067596325440.0, + "grad_norm": 2.960074267374481, + "language_loss": 0.79605556, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82394409, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.785355806350708 + }, + { + "auxiliary_loss_clip": 0.01501716, + "auxiliary_loss_mlp": 0.01298595, + "balance_loss_clip": 1.1513437, + "balance_loss_mlp": 1.04396367, + "epoch": 0.2731399368705847, + "flos": 19173748495680.0, + "grad_norm": 2.748223020246904, + "language_loss": 0.89731431, + "learning_rate": 3.411250012687582e-06, + "loss": 0.92531747, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 2.7689712047576904 + }, + { + "auxiliary_loss_clip": 0.01506455, + "auxiliary_loss_mlp": 0.01313421, + "balance_loss_clip": 1.15619838, + "balance_loss_mlp": 1.05802655, + "epoch": 0.27320006012325265, + "flos": 18291758319360.0, + "grad_norm": 2.384092174313834, + "language_loss": 0.6376335, + "learning_rate": 3.410974019048255e-06, + "loss": 0.66583228, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.7683486938476562 + }, + { + "auxiliary_loss_clip": 0.01504548, + "auxiliary_loss_mlp": 0.01308987, + "balance_loss_clip": 1.15355492, + "balance_loss_mlp": 1.05321169, + "epoch": 0.2732601833759206, + "flos": 34863836741280.0, + "grad_norm": 1.9953978731432918, + "language_loss": 0.69949722, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72763252, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 3.0623018741607666 + }, + { + "auxiliary_loss_clip": 0.0162245, + "auxiliary_loss_mlp": 0.0126281, + "balance_loss_clip": 1.27761912, + "balance_loss_mlp": 1.04384613, + "epoch": 0.2733203066285886, + "flos": 53917086015360.0, + "grad_norm": 0.7174884808055173, + "language_loss": 0.61511505, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.64396763, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.348301410675049 + }, + { + "auxiliary_loss_clip": 0.01504772, + "auxiliary_loss_mlp": 0.01306507, + "balance_loss_clip": 1.1545558, + "balance_loss_mlp": 1.05302012, + "epoch": 0.2733804298812566, + "flos": 20662064431200.0, + "grad_norm": 2.645345582366057, + "language_loss": 0.65133762, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67945039, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 4.435338020324707 + }, + { + "auxiliary_loss_clip": 0.01505195, + "auxiliary_loss_mlp": 0.01319888, + "balance_loss_clip": 1.15478075, + "balance_loss_mlp": 1.07288647, + "epoch": 0.27344055313392457, + "flos": 25886682640800.0, + "grad_norm": 6.191517318942799, + "language_loss": 0.77927327, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.80752409, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 2.7916371822357178 + }, + { + "auxiliary_loss_clip": 0.01504726, + "auxiliary_loss_mlp": 0.0130815, + "balance_loss_clip": 1.15394521, + "balance_loss_mlp": 1.05828786, + "epoch": 0.27350067638659253, + "flos": 22931973610560.0, + "grad_norm": 1.860092334644482, + "language_loss": 0.83021295, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.85834169, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 2.784104585647583 + }, + { + "auxiliary_loss_clip": 0.01496488, + "auxiliary_loss_mlp": 0.01310206, + "balance_loss_clip": 1.14659882, + "balance_loss_mlp": 1.05042505, + "epoch": 0.2735607996392605, + "flos": 16576326375840.0, + "grad_norm": 2.2513450330733247, + "language_loss": 0.71187013, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73993707, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 2.7369515895843506 + }, + { + "auxiliary_loss_clip": 0.01502499, + "auxiliary_loss_mlp": 0.01296667, + "balance_loss_clip": 1.15248656, + "balance_loss_mlp": 1.04394341, + "epoch": 0.27362092289192846, + "flos": 19647286309440.0, + "grad_norm": 2.3225951010745467, + "language_loss": 0.78687572, + "learning_rate": 3.409040566039563e-06, + "loss": 0.81486738, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 2.7738187313079834 + }, + { + "auxiliary_loss_clip": 0.01501697, + "auxiliary_loss_mlp": 0.01290378, + "balance_loss_clip": 1.15058374, + "balance_loss_mlp": 1.03383946, + "epoch": 0.27368104614459643, + "flos": 17641056248640.0, + "grad_norm": 2.8803243581348177, + "language_loss": 0.7071681, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.73508888, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 2.765233278274536 + }, + { + "auxiliary_loss_clip": 0.0149679, + "auxiliary_loss_mlp": 0.01299149, + "balance_loss_clip": 1.14610314, + "balance_loss_mlp": 1.03955913, + "epoch": 0.2737411693972644, + "flos": 21582172772640.0, + "grad_norm": 1.9169205460336065, + "language_loss": 0.71949506, + "learning_rate": 3.408487669858431e-06, + "loss": 0.74745446, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 2.7625670433044434 + }, + { + "auxiliary_loss_clip": 0.01499855, + "auxiliary_loss_mlp": 0.01294387, + "balance_loss_clip": 1.14987481, + "balance_loss_mlp": 1.03479731, + "epoch": 0.27380129264993236, + "flos": 25486422760800.0, + "grad_norm": 1.5645674267790424, + "language_loss": 0.59471029, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.62265271, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 4.311074256896973 + }, + { + "auxiliary_loss_clip": 0.01501469, + "auxiliary_loss_mlp": 0.0129662, + "balance_loss_clip": 1.15224528, + "balance_loss_mlp": 1.03474128, + "epoch": 0.2738614159026003, + "flos": 18663192433440.0, + "grad_norm": 2.386899848170374, + "language_loss": 0.74184924, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76983011, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 4.276238679885864 + }, + { + "auxiliary_loss_clip": 0.01505688, + "auxiliary_loss_mlp": 0.01289883, + "balance_loss_clip": 1.15541387, + "balance_loss_mlp": 1.03220069, + "epoch": 0.2739215391552683, + "flos": 23479396208640.0, + "grad_norm": 2.0912085123309208, + "language_loss": 0.77843589, + "learning_rate": 3.407657925038002e-06, + "loss": 0.8063916, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 2.7574069499969482 + }, + { + "auxiliary_loss_clip": 0.01504545, + "auxiliary_loss_mlp": 0.01315097, + "balance_loss_clip": 1.15407538, + "balance_loss_mlp": 1.05226445, + "epoch": 0.27398166240793626, + "flos": 17130538114560.0, + "grad_norm": 4.274579108027022, + "language_loss": 0.82328439, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.85148078, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 2.809467315673828 + }, + { + "auxiliary_loss_clip": 0.01501071, + "auxiliary_loss_mlp": 0.01304161, + "balance_loss_clip": 1.15026999, + "balance_loss_mlp": 1.04590607, + "epoch": 0.2740417856606042, + "flos": 23407028550720.0, + "grad_norm": 1.8763580725982747, + "language_loss": 0.72807026, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.75612253, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.7666075229644775 + }, + { + "auxiliary_loss_clip": 0.01493673, + "auxiliary_loss_mlp": 0.0129468, + "balance_loss_clip": 1.14201808, + "balance_loss_mlp": 1.0400486, + "epoch": 0.2741019089132722, + "flos": 12781727791200.0, + "grad_norm": 2.217830617614057, + "language_loss": 0.68002719, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70791072, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 2.771944284439087 + }, + { + "auxiliary_loss_clip": 0.01499769, + "auxiliary_loss_mlp": 0.01303007, + "balance_loss_clip": 1.14797676, + "balance_loss_mlp": 1.04932976, + "epoch": 0.27416203216594015, + "flos": 20633655875040.0, + "grad_norm": 2.1852542772631645, + "language_loss": 0.72467136, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.75269908, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 2.806222915649414 + }, + { + "auxiliary_loss_clip": 0.01501001, + "auxiliary_loss_mlp": 0.01296213, + "balance_loss_clip": 1.14925981, + "balance_loss_mlp": 1.03891146, + "epoch": 0.27422215541860817, + "flos": 26543984211360.0, + "grad_norm": 1.751594125892917, + "language_loss": 0.81549442, + "learning_rate": 3.406273949573303e-06, + "loss": 0.84346658, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 2.7881293296813965 + }, + { + "auxiliary_loss_clip": 0.01506883, + "auxiliary_loss_mlp": 0.0129723, + "balance_loss_clip": 1.15442944, + "balance_loss_mlp": 1.04298091, + "epoch": 0.27428227867127614, + "flos": 23333409263520.0, + "grad_norm": 2.181794431366789, + "language_loss": 0.75411683, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.78215796, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.8968985080718994 + }, + { + "auxiliary_loss_clip": 0.01499441, + "auxiliary_loss_mlp": 0.01302763, + "balance_loss_clip": 1.14811754, + "balance_loss_mlp": 1.04603386, + "epoch": 0.2743424019239441, + "flos": 23037111563040.0, + "grad_norm": 1.7086904614186251, + "language_loss": 0.74863142, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.77665347, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.772054672241211 + }, + { + "auxiliary_loss_clip": 0.01510889, + "auxiliary_loss_mlp": 0.01313111, + "balance_loss_clip": 1.15648425, + "balance_loss_mlp": 1.05275798, + "epoch": 0.27440252517661207, + "flos": 21983267072160.0, + "grad_norm": 1.8605053784828947, + "language_loss": 0.62780499, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.65604508, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.779977798461914 + }, + { + "auxiliary_loss_clip": 0.01507394, + "auxiliary_loss_mlp": 0.01299623, + "balance_loss_clip": 1.15435588, + "balance_loss_mlp": 1.04117739, + "epoch": 0.27446264842928003, + "flos": 40190634506880.0, + "grad_norm": 1.6605267176643121, + "language_loss": 0.78576511, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.81383526, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.962012529373169 + }, + { + "auxiliary_loss_clip": 0.01508054, + "auxiliary_loss_mlp": 0.0130613, + "balance_loss_clip": 1.15425992, + "balance_loss_mlp": 1.05073619, + "epoch": 0.274522771681948, + "flos": 13481357552640.0, + "grad_norm": 1.9057165019032742, + "language_loss": 0.6916784, + "learning_rate": 3.404888640957477e-06, + "loss": 0.71982026, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 2.824056386947632 + }, + { + "auxiliary_loss_clip": 0.01499289, + "auxiliary_loss_mlp": 0.01305714, + "balance_loss_clip": 1.14639282, + "balance_loss_mlp": 1.05413413, + "epoch": 0.27458289493461596, + "flos": 28624705907040.0, + "grad_norm": 1.8688818579092146, + "language_loss": 0.60829866, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63634872, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.823010206222534 + }, + { + "auxiliary_loss_clip": 0.01503167, + "auxiliary_loss_mlp": 0.01309298, + "balance_loss_clip": 1.14939761, + "balance_loss_mlp": 1.0523777, + "epoch": 0.2746430181872839, + "flos": 20121848183520.0, + "grad_norm": 2.178103186608422, + "language_loss": 0.82360327, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.8517279, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 2.8056910037994385 + }, + { + "auxiliary_loss_clip": 0.01508994, + "auxiliary_loss_mlp": 0.01290839, + "balance_loss_clip": 1.15542614, + "balance_loss_mlp": 1.03086734, + "epoch": 0.2747031414399519, + "flos": 20195543327040.0, + "grad_norm": 4.083645918093756, + "language_loss": 0.68553114, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.71352947, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 2.9158904552459717 + }, + { + "auxiliary_loss_clip": 0.01502904, + "auxiliary_loss_mlp": 0.01307246, + "balance_loss_clip": 1.14823961, + "balance_loss_mlp": 1.048419, + "epoch": 0.27476326469261986, + "flos": 13518034447680.0, + "grad_norm": 1.958437063092781, + "language_loss": 0.71108079, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73918229, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.8160362243652344 + }, + { + "auxiliary_loss_clip": 0.0160471, + "auxiliary_loss_mlp": 0.01236671, + "balance_loss_clip": 1.25513887, + "balance_loss_mlp": 1.01084137, + "epoch": 0.2748233879452878, + "flos": 65943277572960.0, + "grad_norm": 0.7256805918346345, + "language_loss": 0.5578289, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.58624279, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.446810483932495 + }, + { + "auxiliary_loss_clip": 0.01505365, + "auxiliary_loss_mlp": 0.01294297, + "balance_loss_clip": 1.15011454, + "balance_loss_mlp": 1.03585172, + "epoch": 0.2748835111979558, + "flos": 17386138535040.0, + "grad_norm": 2.8439935843651334, + "language_loss": 0.77483892, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.80283564, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.763132333755493 + }, + { + "auxiliary_loss_clip": 0.01505758, + "auxiliary_loss_mlp": 0.01303139, + "balance_loss_clip": 1.15296018, + "balance_loss_mlp": 1.0496521, + "epoch": 0.27494363445062375, + "flos": 23589882031680.0, + "grad_norm": 1.6298009656306636, + "language_loss": 0.81425011, + "learning_rate": 3.402946971702147e-06, + "loss": 0.8423391, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.871119737625122 + }, + { + "auxiliary_loss_clip": 0.0149834, + "auxiliary_loss_mlp": 0.01288441, + "balance_loss_clip": 1.14454222, + "balance_loss_mlp": 1.03094864, + "epoch": 0.2750037577032918, + "flos": 17166456446400.0, + "grad_norm": 1.7400779897282554, + "language_loss": 0.79495329, + "learning_rate": 3.402669377496223e-06, + "loss": 0.82282114, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.746809482574463 + }, + { + "auxiliary_loss_clip": 0.01503849, + "auxiliary_loss_mlp": 0.01299584, + "balance_loss_clip": 1.1497345, + "balance_loss_mlp": 1.04666984, + "epoch": 0.27506388095595974, + "flos": 24493832976960.0, + "grad_norm": 2.894517829737378, + "language_loss": 0.74220628, + "learning_rate": 3.402391730100936e-06, + "loss": 0.77024055, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.8251101970672607 + }, + { + "auxiliary_loss_clip": 0.01507646, + "auxiliary_loss_mlp": 0.0129709, + "balance_loss_clip": 1.15225518, + "balance_loss_mlp": 1.04245877, + "epoch": 0.2751240042086277, + "flos": 38767821232320.0, + "grad_norm": 1.9158051573784085, + "language_loss": 0.72114682, + "learning_rate": 3.402114029526814e-06, + "loss": 0.74919415, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 2.926492929458618 + }, + { + "auxiliary_loss_clip": 0.01501027, + "auxiliary_loss_mlp": 0.0128848, + "balance_loss_clip": 1.14624679, + "balance_loss_mlp": 1.03270423, + "epoch": 0.27518412746129567, + "flos": 26909501532480.0, + "grad_norm": 1.9956301425401624, + "language_loss": 0.73324227, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.76113737, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 2.8025918006896973 + }, + { + "auxiliary_loss_clip": 0.01507039, + "auxiliary_loss_mlp": 0.01304432, + "balance_loss_clip": 1.15064573, + "balance_loss_mlp": 1.04484189, + "epoch": 0.27524425071396363, + "flos": 24902892190080.0, + "grad_norm": 1.99742849502611, + "language_loss": 0.7605961, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78871083, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.9535388946533203 + }, + { + "auxiliary_loss_clip": 0.01501817, + "auxiliary_loss_mlp": 0.01304678, + "balance_loss_clip": 1.14536142, + "balance_loss_mlp": 1.04833066, + "epoch": 0.2753043739666316, + "flos": 26290545696000.0, + "grad_norm": 1.6064407002608831, + "language_loss": 0.66403615, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.69210112, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.8686635494232178 + }, + { + "auxiliary_loss_clip": 0.01496181, + "auxiliary_loss_mlp": 0.01290259, + "balance_loss_clip": 1.14094055, + "balance_loss_mlp": 1.03276718, + "epoch": 0.27536449721929956, + "flos": 24209065437120.0, + "grad_norm": 1.954746618643593, + "language_loss": 0.80051887, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82838327, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.8937904834747314 + }, + { + "auxiliary_loss_clip": 0.01507395, + "auxiliary_loss_mlp": 0.01287406, + "balance_loss_clip": 1.1503433, + "balance_loss_mlp": 1.02934229, + "epoch": 0.27542462047196753, + "flos": 19539341673120.0, + "grad_norm": 4.147256020840413, + "language_loss": 0.67817473, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.70612276, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 2.7645819187164307 + }, + { + "auxiliary_loss_clip": 0.01493213, + "auxiliary_loss_mlp": 0.01295139, + "balance_loss_clip": 1.13763714, + "balance_loss_mlp": 1.03306961, + "epoch": 0.2754847437246355, + "flos": 14320905753600.0, + "grad_norm": 1.8018014841839571, + "language_loss": 0.7810626, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80894613, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.701568126678467 + }, + { + "auxiliary_loss_clip": 0.015015, + "auxiliary_loss_mlp": 0.01289101, + "balance_loss_clip": 1.14626193, + "balance_loss_mlp": 1.03618705, + "epoch": 0.27554486697730346, + "flos": 18840508403040.0, + "grad_norm": 1.7431238009561292, + "language_loss": 0.84732664, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.87523264, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.7821571826934814 + }, + { + "auxiliary_loss_clip": 0.01499745, + "auxiliary_loss_mlp": 0.01299846, + "balance_loss_clip": 1.14414358, + "balance_loss_mlp": 1.04349828, + "epoch": 0.2756049902299714, + "flos": 22384475156160.0, + "grad_norm": 8.28404683474385, + "language_loss": 0.67224169, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.70023763, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.776392936706543 + }, + { + "auxiliary_loss_clip": 0.01505433, + "auxiliary_loss_mlp": 0.0129826, + "balance_loss_clip": 1.14915562, + "balance_loss_mlp": 1.03828847, + "epoch": 0.2756651134826394, + "flos": 19575980640000.0, + "grad_norm": 1.817128228277985, + "language_loss": 0.77464753, + "learning_rate": 3.399612333050327e-06, + "loss": 0.80268443, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 4.352849245071411 + }, + { + "auxiliary_loss_clip": 0.01506891, + "auxiliary_loss_mlp": 0.01301854, + "balance_loss_clip": 1.15145755, + "balance_loss_mlp": 1.04436147, + "epoch": 0.27572523673530736, + "flos": 23588820043200.0, + "grad_norm": 1.5796010641171316, + "language_loss": 0.72260141, + "learning_rate": 3.399334101267362e-06, + "loss": 0.75068891, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 2.77966570854187 + }, + { + "auxiliary_loss_clip": 0.01508492, + "auxiliary_loss_mlp": 0.01303609, + "balance_loss_clip": 1.15391898, + "balance_loss_mlp": 1.045735, + "epoch": 0.2757853599879754, + "flos": 22822322207040.0, + "grad_norm": 1.5283262014359142, + "language_loss": 0.80305606, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.83117712, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 2.7227749824523926 + }, + { + "auxiliary_loss_clip": 0.01498325, + "auxiliary_loss_mlp": 0.0129714, + "balance_loss_clip": 1.14244187, + "balance_loss_mlp": 1.03945732, + "epoch": 0.27584548324064334, + "flos": 18553692742560.0, + "grad_norm": 1.8294070955201471, + "language_loss": 0.83142877, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85938346, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.7508604526519775 + }, + { + "auxiliary_loss_clip": 0.0150311, + "auxiliary_loss_mlp": 0.01287931, + "balance_loss_clip": 1.1465385, + "balance_loss_mlp": 1.0327282, + "epoch": 0.2759056064933113, + "flos": 23772204518400.0, + "grad_norm": 1.4205170452733484, + "language_loss": 0.75463521, + "learning_rate": 3.398499087583342e-06, + "loss": 0.78254569, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 2.775404691696167 + }, + { + "auxiliary_loss_clip": 0.01507653, + "auxiliary_loss_mlp": 0.01299059, + "balance_loss_clip": 1.14910197, + "balance_loss_mlp": 1.04480982, + "epoch": 0.27596572974597927, + "flos": 24284315635200.0, + "grad_norm": 1.7011800092767548, + "language_loss": 0.88878053, + "learning_rate": 3.398220643612143e-06, + "loss": 0.91684771, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 2.8057682514190674 + }, + { + "auxiliary_loss_clip": 0.01510703, + "auxiliary_loss_mlp": 0.01318133, + "balance_loss_clip": 1.15229225, + "balance_loss_mlp": 1.05930555, + "epoch": 0.27602585299864724, + "flos": 35043769753920.0, + "grad_norm": 1.6407997049750174, + "language_loss": 0.71753204, + "learning_rate": 3.397942146620277e-06, + "loss": 0.7458204, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 2.9391229152679443 + }, + { + "auxiliary_loss_clip": 0.01505986, + "auxiliary_loss_mlp": 0.01300846, + "balance_loss_clip": 1.14937985, + "balance_loss_mlp": 1.03763163, + "epoch": 0.2760859762513152, + "flos": 24311320849440.0, + "grad_norm": 2.1133113654395066, + "language_loss": 0.80546761, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.83353591, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 4.2074737548828125 + }, + { + "auxiliary_loss_clip": 0.01602113, + "auxiliary_loss_mlp": 0.01258575, + "balance_loss_clip": 1.24913347, + "balance_loss_mlp": 1.03121948, + "epoch": 0.27614609950398317, + "flos": 71267003157600.0, + "grad_norm": 0.7071761346891443, + "language_loss": 0.61541009, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.64401692, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 6.280070543289185 + }, + { + "auxiliary_loss_clip": 0.01506766, + "auxiliary_loss_mlp": 0.01301888, + "balance_loss_clip": 1.14888918, + "balance_loss_mlp": 1.04382372, + "epoch": 0.27620622275665113, + "flos": 29677184984160.0, + "grad_norm": 1.872133350283773, + "language_loss": 0.77580929, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.80389583, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 2.8300535678863525 + }, + { + "auxiliary_loss_clip": 0.01515267, + "auxiliary_loss_mlp": 0.01313963, + "balance_loss_clip": 1.15814126, + "balance_loss_mlp": 1.05933189, + "epoch": 0.2762663460093191, + "flos": 15379908474240.0, + "grad_norm": 1.6489506544268913, + "language_loss": 0.91796577, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.94625807, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 2.7603843212127686 + }, + { + "auxiliary_loss_clip": 0.01513043, + "auxiliary_loss_mlp": 0.01320357, + "balance_loss_clip": 1.15620375, + "balance_loss_mlp": 1.06210208, + "epoch": 0.27632646926198706, + "flos": 20706175245600.0, + "grad_norm": 2.0805328019981904, + "language_loss": 0.69440138, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.7227354, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 2.7479138374328613 + }, + { + "auxiliary_loss_clip": 0.0151438, + "auxiliary_loss_mlp": 0.01312712, + "balance_loss_clip": 1.15766954, + "balance_loss_mlp": 1.04911685, + "epoch": 0.276386592514655, + "flos": 32815923268320.0, + "grad_norm": 1.9606244077950519, + "language_loss": 0.63616377, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.66443467, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 2.91196608543396 + }, + { + "auxiliary_loss_clip": 0.01513701, + "auxiliary_loss_mlp": 0.01301522, + "balance_loss_clip": 1.15565872, + "balance_loss_mlp": 1.04555583, + "epoch": 0.276446715767323, + "flos": 18553692742560.0, + "grad_norm": 1.8881651864107305, + "language_loss": 0.86111605, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88926828, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.725212812423706 + }, + { + "auxiliary_loss_clip": 0.01516246, + "auxiliary_loss_mlp": 0.0129503, + "balance_loss_clip": 1.15792572, + "balance_loss_mlp": 1.03353238, + "epoch": 0.27650683901999096, + "flos": 22821791212800.0, + "grad_norm": 2.2664031170788848, + "language_loss": 0.80010009, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82821286, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.807924747467041 + }, + { + "auxiliary_loss_clip": 0.01507374, + "auxiliary_loss_mlp": 0.01298892, + "balance_loss_clip": 1.15044951, + "balance_loss_mlp": 1.03396189, + "epoch": 0.276566962272659, + "flos": 21363704385120.0, + "grad_norm": 1.5934580351778171, + "language_loss": 0.79111242, + "learning_rate": 3.395433289506639e-06, + "loss": 0.81917506, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 2.840846538543701 + }, + { + "auxiliary_loss_clip": 0.01516585, + "auxiliary_loss_mlp": 0.01317069, + "balance_loss_clip": 1.15824151, + "balance_loss_mlp": 1.05442691, + "epoch": 0.27662708552532694, + "flos": 17712437774400.0, + "grad_norm": 2.119721506675847, + "language_loss": 0.73580974, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.76414621, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 2.810899019241333 + }, + { + "auxiliary_loss_clip": 0.0151541, + "auxiliary_loss_mlp": 0.01310186, + "balance_loss_clip": 1.1567378, + "balance_loss_mlp": 1.05326617, + "epoch": 0.2766872087779949, + "flos": 21254925329280.0, + "grad_norm": 1.4949465081528177, + "language_loss": 0.80389708, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.83215308, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.892631769180298 + }, + { + "auxiliary_loss_clip": 0.01505028, + "auxiliary_loss_mlp": 0.01303643, + "balance_loss_clip": 1.14683914, + "balance_loss_mlp": 1.03833091, + "epoch": 0.2767473320306629, + "flos": 12933517744800.0, + "grad_norm": 2.2540642268742044, + "language_loss": 0.77368212, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.80176878, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.740274429321289 + }, + { + "auxiliary_loss_clip": 0.01515761, + "auxiliary_loss_mlp": 0.01298313, + "balance_loss_clip": 1.15782475, + "balance_loss_mlp": 1.03986692, + "epoch": 0.27680745528333084, + "flos": 15014656650240.0, + "grad_norm": 1.6625589171861972, + "language_loss": 0.81688809, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.84502876, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.739048957824707 + }, + { + "auxiliary_loss_clip": 0.01505234, + "auxiliary_loss_mlp": 0.01295905, + "balance_loss_clip": 1.14748156, + "balance_loss_mlp": 1.0372684, + "epoch": 0.2768675785359988, + "flos": 22640037648480.0, + "grad_norm": 2.0034240521714013, + "language_loss": 0.70356995, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.73158133, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.7628071308135986 + }, + { + "auxiliary_loss_clip": 0.01611974, + "auxiliary_loss_mlp": 0.01246323, + "balance_loss_clip": 1.25463963, + "balance_loss_mlp": 1.02583313, + "epoch": 0.27692770178866677, + "flos": 66137319861120.0, + "grad_norm": 0.7003951267174615, + "language_loss": 0.57085359, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59943652, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 3.449902296066284 + }, + { + "auxiliary_loss_clip": 0.0151411, + "auxiliary_loss_mlp": 0.01307304, + "balance_loss_clip": 1.15643096, + "balance_loss_mlp": 1.05000305, + "epoch": 0.27698782504133473, + "flos": 26467178958720.0, + "grad_norm": 2.157699823283183, + "language_loss": 0.69856888, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.72678304, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.8587965965270996 + }, + { + "auxiliary_loss_clip": 0.01500652, + "auxiliary_loss_mlp": 0.01293065, + "balance_loss_clip": 1.14295113, + "balance_loss_mlp": 1.04015088, + "epoch": 0.2770479482940027, + "flos": 25887175706880.0, + "grad_norm": 1.744551085047869, + "language_loss": 0.70006126, + "learning_rate": 3.393199595837555e-06, + "loss": 0.72799844, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.829860210418701 + }, + { + "auxiliary_loss_clip": 0.01506336, + "auxiliary_loss_mlp": 0.0130541, + "balance_loss_clip": 1.14881182, + "balance_loss_mlp": 1.04582024, + "epoch": 0.27710807154667066, + "flos": 22859492168160.0, + "grad_norm": 1.7448877679751786, + "language_loss": 0.72894418, + "learning_rate": 3.392920146281499e-06, + "loss": 0.75706172, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.82085919380188 + }, + { + "auxiliary_loss_clip": 0.01517665, + "auxiliary_loss_mlp": 0.01297515, + "balance_loss_clip": 1.16026187, + "balance_loss_mlp": 1.03658986, + "epoch": 0.27716819479933863, + "flos": 17712930840480.0, + "grad_norm": 2.524622640075876, + "language_loss": 0.83593667, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86408848, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.7407379150390625 + }, + { + "auxiliary_loss_clip": 0.01515587, + "auxiliary_loss_mlp": 0.01307062, + "balance_loss_clip": 1.15825391, + "balance_loss_mlp": 1.04651797, + "epoch": 0.2772283180520066, + "flos": 19648082800800.0, + "grad_norm": 2.0640260315743486, + "language_loss": 0.70315301, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.73137951, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.7884979248046875 + }, + { + "auxiliary_loss_clip": 0.01513273, + "auxiliary_loss_mlp": 0.01299106, + "balance_loss_clip": 1.15744066, + "balance_loss_mlp": 1.04333043, + "epoch": 0.27728844130467456, + "flos": 21034750174560.0, + "grad_norm": 1.804960340513762, + "language_loss": 0.74033928, + "learning_rate": 3.392081480737698e-06, + "loss": 0.76846302, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.771608829498291 + }, + { + "auxiliary_loss_clip": 0.01519699, + "auxiliary_loss_mlp": 0.01305828, + "balance_loss_clip": 1.16265464, + "balance_loss_mlp": 1.05291367, + "epoch": 0.2773485645573425, + "flos": 18991198440000.0, + "grad_norm": 2.316167513410815, + "language_loss": 0.6666249, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.69488013, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.8392813205718994 + }, + { + "auxiliary_loss_clip": 0.0150821, + "auxiliary_loss_mlp": 0.01318167, + "balance_loss_clip": 1.15113187, + "balance_loss_mlp": 1.0637269, + "epoch": 0.27740868781001055, + "flos": 21470473248480.0, + "grad_norm": 2.0551245529659643, + "language_loss": 0.79519153, + "learning_rate": 3.39152210641815e-06, + "loss": 0.82345533, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.7935030460357666 + }, + { + "auxiliary_loss_clip": 0.01512328, + "auxiliary_loss_mlp": 0.01311296, + "balance_loss_clip": 1.15541983, + "balance_loss_mlp": 1.05647397, + "epoch": 0.2774688110626785, + "flos": 19829836365120.0, + "grad_norm": 2.342677254651568, + "language_loss": 0.79796588, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82620215, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 2.784562587738037 + }, + { + "auxiliary_loss_clip": 0.01516741, + "auxiliary_loss_mlp": 0.01317205, + "balance_loss_clip": 1.15922678, + "balance_loss_mlp": 1.05990374, + "epoch": 0.2775289343153465, + "flos": 18217797678720.0, + "grad_norm": 2.498304830531258, + "language_loss": 0.63489544, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66323495, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 2.7460145950317383 + }, + { + "auxiliary_loss_clip": 0.01520413, + "auxiliary_loss_mlp": 0.0130338, + "balance_loss_clip": 1.16378856, + "balance_loss_mlp": 1.05275488, + "epoch": 0.27758905756801444, + "flos": 16474374388800.0, + "grad_norm": 2.7594607246568597, + "language_loss": 0.82788295, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.85612082, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.801772356033325 + }, + { + "auxiliary_loss_clip": 0.01516332, + "auxiliary_loss_mlp": 0.01303493, + "balance_loss_clip": 1.15892005, + "balance_loss_mlp": 1.04771769, + "epoch": 0.2776491808206824, + "flos": 18729908795520.0, + "grad_norm": 2.4479136672583803, + "language_loss": 0.77126741, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79946572, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 2.7708027362823486 + }, + { + "auxiliary_loss_clip": 0.0151248, + "auxiliary_loss_mlp": 0.01301939, + "balance_loss_clip": 1.15611148, + "balance_loss_mlp": 1.0459733, + "epoch": 0.27770930407335037, + "flos": 28040720198400.0, + "grad_norm": 1.78720202389366, + "language_loss": 0.84631658, + "learning_rate": 3.390122747388459e-06, + "loss": 0.87446076, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.7881593704223633 + }, + { + "auxiliary_loss_clip": 0.01514452, + "auxiliary_loss_mlp": 0.01287771, + "balance_loss_clip": 1.15829563, + "balance_loss_mlp": 1.03581047, + "epoch": 0.27776942732601834, + "flos": 23552067291840.0, + "grad_norm": 1.6152104132933058, + "language_loss": 0.76934159, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79736376, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.859483480453491 + }, + { + "auxiliary_loss_clip": 0.01510557, + "auxiliary_loss_mlp": 0.01298295, + "balance_loss_clip": 1.1551801, + "balance_loss_mlp": 1.04633451, + "epoch": 0.2778295505786863, + "flos": 23910643759680.0, + "grad_norm": 1.7965720303886383, + "language_loss": 0.78684783, + "learning_rate": 3.389562634707122e-06, + "loss": 0.81493628, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.7662172317504883 + }, + { + "auxiliary_loss_clip": 0.01516494, + "auxiliary_loss_mlp": 0.01292074, + "balance_loss_clip": 1.16062427, + "balance_loss_mlp": 1.03267443, + "epoch": 0.27788967383135427, + "flos": 25556818154400.0, + "grad_norm": 2.287870360486588, + "language_loss": 0.87416875, + "learning_rate": 3.389282499322611e-06, + "loss": 0.90225446, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 4.492748022079468 + }, + { + "auxiliary_loss_clip": 0.01505073, + "auxiliary_loss_mlp": 0.0128908, + "balance_loss_clip": 1.1500423, + "balance_loss_mlp": 1.02910805, + "epoch": 0.27794979708402223, + "flos": 16254464731200.0, + "grad_norm": 2.1225594926642266, + "language_loss": 0.81423664, + "learning_rate": 3.389002311256369e-06, + "loss": 0.84217817, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 2.850599527359009 + }, + { + "auxiliary_loss_clip": 0.01516029, + "auxiliary_loss_mlp": 0.01285163, + "balance_loss_clip": 1.15931201, + "balance_loss_mlp": 1.02709925, + "epoch": 0.2780099203366902, + "flos": 20669725919520.0, + "grad_norm": 1.8925474411488399, + "language_loss": 0.81297493, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.84098685, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 2.733398914337158 + }, + { + "auxiliary_loss_clip": 0.01506683, + "auxiliary_loss_mlp": 0.01295309, + "balance_loss_clip": 1.15257812, + "balance_loss_mlp": 1.03438377, + "epoch": 0.27807004358935816, + "flos": 17740998043200.0, + "grad_norm": 2.2362127797885227, + "language_loss": 0.76705837, + "learning_rate": 3.388441777121191e-06, + "loss": 0.79507828, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.749220132827759 + }, + { + "auxiliary_loss_clip": 0.01502811, + "auxiliary_loss_mlp": 0.0128672, + "balance_loss_clip": 1.14789617, + "balance_loss_mlp": 1.02903676, + "epoch": 0.2781301668420261, + "flos": 16728988677120.0, + "grad_norm": 2.4050674204931086, + "language_loss": 0.70320439, + "learning_rate": 3.388161431073511e-06, + "loss": 0.73109972, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 2.718287467956543 + }, + { + "auxiliary_loss_clip": 0.01513539, + "auxiliary_loss_mlp": 0.01314193, + "balance_loss_clip": 1.15615153, + "balance_loss_mlp": 1.05822647, + "epoch": 0.27819029009469415, + "flos": 13846457664000.0, + "grad_norm": 2.8114427712851526, + "language_loss": 0.93502152, + "learning_rate": 3.38788103238661e-06, + "loss": 0.96329892, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 2.7807931900024414 + }, + { + "auxiliary_loss_clip": 0.0151214, + "auxiliary_loss_mlp": 0.0130102, + "balance_loss_clip": 1.15606558, + "balance_loss_mlp": 1.04276526, + "epoch": 0.2782504133473621, + "flos": 27091748162880.0, + "grad_norm": 2.250073407035074, + "language_loss": 0.86025071, + "learning_rate": 3.387600581071121e-06, + "loss": 0.88838232, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 2.827259063720703 + }, + { + "auxiliary_loss_clip": 0.01501881, + "auxiliary_loss_mlp": 0.01290069, + "balance_loss_clip": 1.14699709, + "balance_loss_mlp": 1.03505635, + "epoch": 0.2783105366000301, + "flos": 21070934003520.0, + "grad_norm": 1.5386355712493218, + "language_loss": 0.79534936, + "learning_rate": 3.387320077137679e-06, + "loss": 0.82326889, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 4.183778762817383 + }, + { + "auxiliary_loss_clip": 0.01506636, + "auxiliary_loss_mlp": 0.01286761, + "balance_loss_clip": 1.15093493, + "balance_loss_mlp": 1.03136683, + "epoch": 0.27837065985269804, + "flos": 26504045494560.0, + "grad_norm": 1.5634157671097637, + "language_loss": 0.84775579, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.87568969, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.8150794506073 + }, + { + "auxiliary_loss_clip": 0.01504558, + "auxiliary_loss_mlp": 0.01289842, + "balance_loss_clip": 1.14962566, + "balance_loss_mlp": 1.03196859, + "epoch": 0.278430783105366, + "flos": 20224900087200.0, + "grad_norm": 1.8483830357890991, + "language_loss": 0.81560063, + "learning_rate": 3.386758911459485e-06, + "loss": 0.8435446, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 4.430392026901245 + }, + { + "auxiliary_loss_clip": 0.015047, + "auxiliary_loss_mlp": 0.01297099, + "balance_loss_clip": 1.14906645, + "balance_loss_mlp": 1.03521967, + "epoch": 0.278490906358034, + "flos": 25595239744800.0, + "grad_norm": 2.073330361770331, + "language_loss": 0.71817899, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74619699, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 2.80879282951355 + }, + { + "auxiliary_loss_clip": 0.01505967, + "auxiliary_loss_mlp": 0.0128327, + "balance_loss_clip": 1.15082669, + "balance_loss_mlp": 1.02825737, + "epoch": 0.27855102961070194, + "flos": 16172918458560.0, + "grad_norm": 2.078203369097644, + "language_loss": 0.82512188, + "learning_rate": 3.386197535437145e-06, + "loss": 0.85301429, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.7511191368103027 + }, + { + "auxiliary_loss_clip": 0.0150311, + "auxiliary_loss_mlp": 0.01287467, + "balance_loss_clip": 1.14621902, + "balance_loss_mlp": 1.03321731, + "epoch": 0.2786111528633699, + "flos": 22929697920960.0, + "grad_norm": 1.7261117189288468, + "language_loss": 0.88064492, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90855062, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 2.7362473011016846 + }, + { + "auxiliary_loss_clip": 0.01505018, + "auxiliary_loss_mlp": 0.01296725, + "balance_loss_clip": 1.15039718, + "balance_loss_mlp": 1.03713489, + "epoch": 0.27867127611603787, + "flos": 23406459628320.0, + "grad_norm": 1.6333286488454508, + "language_loss": 0.77063406, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79865146, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 2.7757506370544434 + }, + { + "auxiliary_loss_clip": 0.01502491, + "auxiliary_loss_mlp": 0.01295691, + "balance_loss_clip": 1.14651322, + "balance_loss_mlp": 1.03858078, + "epoch": 0.27873139936870583, + "flos": 19831087994400.0, + "grad_norm": 1.751529004157013, + "language_loss": 0.65379548, + "learning_rate": 3.385355077194637e-06, + "loss": 0.68177724, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 2.7551534175872803 + }, + { + "auxiliary_loss_clip": 0.01499097, + "auxiliary_loss_mlp": 0.01294204, + "balance_loss_clip": 1.14403832, + "balance_loss_mlp": 1.03404129, + "epoch": 0.2787915226213738, + "flos": 17709327665280.0, + "grad_norm": 4.217466880184444, + "language_loss": 0.83277142, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86070442, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 2.685967206954956 + }, + { + "auxiliary_loss_clip": 0.01496415, + "auxiliary_loss_mlp": 0.01280791, + "balance_loss_clip": 1.14046085, + "balance_loss_mlp": 1.02673197, + "epoch": 0.27885164587404176, + "flos": 22092728834880.0, + "grad_norm": 1.6431061921431707, + "language_loss": 0.76172841, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78950047, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.7942862510681152 + }, + { + "auxiliary_loss_clip": 0.01503466, + "auxiliary_loss_mlp": 0.01286672, + "balance_loss_clip": 1.14689362, + "balance_loss_mlp": 1.02536547, + "epoch": 0.27891176912670973, + "flos": 19209666827520.0, + "grad_norm": 1.4629836265176375, + "language_loss": 0.71790242, + "learning_rate": 3.38451214615691e-06, + "loss": 0.74580383, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 2.7363195419311523 + }, + { + "auxiliary_loss_clip": 0.01499853, + "auxiliary_loss_mlp": 0.01281792, + "balance_loss_clip": 1.14501166, + "balance_loss_mlp": 1.02468109, + "epoch": 0.27897189237937775, + "flos": 27602493865920.0, + "grad_norm": 2.1371943293940854, + "language_loss": 0.65788615, + "learning_rate": 3.384231064128447e-06, + "loss": 0.68570256, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.8166444301605225 + }, + { + "auxiliary_loss_clip": 0.01503354, + "auxiliary_loss_mlp": 0.01284342, + "balance_loss_clip": 1.14806151, + "balance_loss_mlp": 1.02799463, + "epoch": 0.2790320156320457, + "flos": 21180319909920.0, + "grad_norm": 1.8741001045914294, + "language_loss": 0.72280788, + "learning_rate": 3.383949929609804e-06, + "loss": 0.75068486, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 2.7482781410217285 + }, + { + "auxiliary_loss_clip": 0.01500622, + "auxiliary_loss_mlp": 0.0129651, + "balance_loss_clip": 1.14537573, + "balance_loss_mlp": 1.0388267, + "epoch": 0.2790921388847137, + "flos": 22786138378080.0, + "grad_norm": 2.5991237317106375, + "language_loss": 0.7507894, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77876079, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.764619827270508 + }, + { + "auxiliary_loss_clip": 0.01504325, + "auxiliary_loss_mlp": 0.01316305, + "balance_loss_clip": 1.14944172, + "balance_loss_mlp": 1.06377172, + "epoch": 0.27915226213738165, + "flos": 23402704740480.0, + "grad_norm": 1.7285356994559138, + "language_loss": 0.86019403, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.88840026, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.737856864929199 + }, + { + "auxiliary_loss_clip": 0.01510507, + "auxiliary_loss_mlp": 0.01297773, + "balance_loss_clip": 1.15621972, + "balance_loss_mlp": 1.04104388, + "epoch": 0.2792123853900496, + "flos": 22750030405440.0, + "grad_norm": 1.7981125252738834, + "language_loss": 0.83085954, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85894239, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.852235794067383 + }, + { + "auxiliary_loss_clip": 0.01508964, + "auxiliary_loss_mlp": 0.01295985, + "balance_loss_clip": 1.15458703, + "balance_loss_mlp": 1.04192662, + "epoch": 0.2792725086427176, + "flos": 15051181832640.0, + "grad_norm": 3.7217113391619763, + "language_loss": 0.79093081, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81898034, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 2.7561588287353516 + }, + { + "auxiliary_loss_clip": 0.0162597, + "auxiliary_loss_mlp": 0.01256577, + "balance_loss_clip": 1.27118564, + "balance_loss_mlp": 1.03684998, + "epoch": 0.27933263189538554, + "flos": 62550873204480.0, + "grad_norm": 0.7819964757043133, + "language_loss": 0.62213385, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.65095925, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.2894346714019775 + }, + { + "auxiliary_loss_clip": 0.01502097, + "auxiliary_loss_mlp": 0.01301648, + "balance_loss_clip": 1.14738846, + "balance_loss_mlp": 1.04930568, + "epoch": 0.2793927551480535, + "flos": 25120677870720.0, + "grad_norm": 1.6679589444734235, + "language_loss": 0.89617169, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.92420918, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.834211826324463 + }, + { + "auxiliary_loss_clip": 0.01505516, + "auxiliary_loss_mlp": 0.01300214, + "balance_loss_clip": 1.15203369, + "balance_loss_mlp": 1.0444386, + "epoch": 0.27945287840072147, + "flos": 21326724064800.0, + "grad_norm": 1.6864020624453593, + "language_loss": 0.87282664, + "learning_rate": 3.381980519149988e-06, + "loss": 0.90088391, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.7825517654418945 + }, + { + "auxiliary_loss_clip": 0.01504521, + "auxiliary_loss_mlp": 0.01295817, + "balance_loss_clip": 1.14955187, + "balance_loss_mlp": 1.03927851, + "epoch": 0.27951300165338944, + "flos": 27452827889280.0, + "grad_norm": 2.23693761725531, + "language_loss": 0.73129237, + "learning_rate": 3.38169896509385e-06, + "loss": 0.7592957, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.8210978507995605 + }, + { + "auxiliary_loss_clip": 0.01504512, + "auxiliary_loss_mlp": 0.0131682, + "balance_loss_clip": 1.14986634, + "balance_loss_mlp": 1.0661943, + "epoch": 0.2795731249060574, + "flos": 15160833236160.0, + "grad_norm": 2.9851051413390746, + "language_loss": 0.81189674, + "learning_rate": 3.381417358643549e-06, + "loss": 0.84011006, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.722881317138672 + }, + { + "auxiliary_loss_clip": 0.01614385, + "auxiliary_loss_mlp": 0.01248207, + "balance_loss_clip": 1.26066089, + "balance_loss_mlp": 1.02771759, + "epoch": 0.27963324815872537, + "flos": 60127087658400.0, + "grad_norm": 0.8353904810093833, + "language_loss": 0.5890134, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.6176393, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.3823444843292236 + }, + { + "auxiliary_loss_clip": 0.01500539, + "auxiliary_loss_mlp": 0.0130561, + "balance_loss_clip": 1.14576316, + "balance_loss_mlp": 1.04602003, + "epoch": 0.27969337141139333, + "flos": 21768326003520.0, + "grad_norm": 1.7437654610970754, + "language_loss": 0.74010277, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76816422, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.8017914295196533 + }, + { + "auxiliary_loss_clip": 0.01508089, + "auxiliary_loss_mlp": 0.01299751, + "balance_loss_clip": 1.15453339, + "balance_loss_mlp": 1.04092431, + "epoch": 0.27975349466406135, + "flos": 39854132592480.0, + "grad_norm": 2.3429570689186, + "language_loss": 0.79982889, + "learning_rate": 3.380572225034461e-06, + "loss": 0.82790732, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.8878977298736572 + }, + { + "auxiliary_loss_clip": 0.01507105, + "auxiliary_loss_mlp": 0.01294762, + "balance_loss_clip": 1.1526804, + "balance_loss_mlp": 1.03669715, + "epoch": 0.2798136179167293, + "flos": 21581945203680.0, + "grad_norm": 2.2353586785108512, + "language_loss": 0.79061055, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81862921, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.807128667831421 + }, + { + "auxiliary_loss_clip": 0.01510408, + "auxiliary_loss_mlp": 0.01290575, + "balance_loss_clip": 1.15500617, + "balance_loss_mlp": 1.02945864, + "epoch": 0.2798737411693973, + "flos": 21539503228320.0, + "grad_norm": 1.9728502341200866, + "language_loss": 0.81231701, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.84032691, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 2.8154261112213135 + }, + { + "auxiliary_loss_clip": 0.0150158, + "auxiliary_loss_mlp": 0.01297218, + "balance_loss_clip": 1.14738345, + "balance_loss_mlp": 1.03686523, + "epoch": 0.27993386442206525, + "flos": 26983575957600.0, + "grad_norm": 1.5099789058798545, + "language_loss": 0.81580424, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8437922, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 2.819080114364624 + }, + { + "auxiliary_loss_clip": 0.01504939, + "auxiliary_loss_mlp": 0.01295499, + "balance_loss_clip": 1.14965558, + "balance_loss_mlp": 1.03571856, + "epoch": 0.2799939876747332, + "flos": 24352018129440.0, + "grad_norm": 1.7618868633171758, + "language_loss": 0.83234936, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.86035383, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.8247079849243164 + }, + { + "auxiliary_loss_clip": 0.01504635, + "auxiliary_loss_mlp": 0.01293599, + "balance_loss_clip": 1.14827085, + "balance_loss_mlp": 1.0319103, + "epoch": 0.2800541109274012, + "flos": 33659833207680.0, + "grad_norm": 2.277500601201057, + "language_loss": 0.640028, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66801035, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.861006498336792 + }, + { + "auxiliary_loss_clip": 0.01511187, + "auxiliary_loss_mlp": 0.01301844, + "balance_loss_clip": 1.15608668, + "balance_loss_mlp": 1.04645014, + "epoch": 0.28011423418006914, + "flos": 21616460193600.0, + "grad_norm": 2.422364935937579, + "language_loss": 0.78775632, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.81588662, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.800539016723633 + }, + { + "auxiliary_loss_clip": 0.01500734, + "auxiliary_loss_mlp": 0.01283809, + "balance_loss_clip": 1.14534104, + "balance_loss_mlp": 1.02688873, + "epoch": 0.2801743574327371, + "flos": 23114675378880.0, + "grad_norm": 1.9254609008280514, + "language_loss": 0.79946107, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.82730651, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 4.494457721710205 + }, + { + "auxiliary_loss_clip": 0.0150788, + "auxiliary_loss_mlp": 0.01288231, + "balance_loss_clip": 1.15225399, + "balance_loss_mlp": 1.03474391, + "epoch": 0.2802344806854051, + "flos": 12643098909120.0, + "grad_norm": 1.8168160093498684, + "language_loss": 0.80176413, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82972527, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 2.7436881065368652 + }, + { + "auxiliary_loss_clip": 0.01514943, + "auxiliary_loss_mlp": 0.01300372, + "balance_loss_clip": 1.16092634, + "balance_loss_mlp": 1.04535985, + "epoch": 0.28029460393807304, + "flos": 37270857676320.0, + "grad_norm": 1.85547059913345, + "language_loss": 0.78211176, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.81026495, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 2.967229127883911 + }, + { + "auxiliary_loss_clip": 0.01503854, + "auxiliary_loss_mlp": 0.01296495, + "balance_loss_clip": 1.14855194, + "balance_loss_mlp": 1.04052925, + "epoch": 0.280354727190741, + "flos": 20743231422240.0, + "grad_norm": 2.199321120607415, + "language_loss": 0.69685829, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72486174, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 2.7517549991607666 + }, + { + "auxiliary_loss_clip": 0.01509811, + "auxiliary_loss_mlp": 0.0131832, + "balance_loss_clip": 1.15548468, + "balance_loss_mlp": 1.06407022, + "epoch": 0.28041485044340897, + "flos": 21473204076000.0, + "grad_norm": 1.6585200080886122, + "language_loss": 0.77987027, + "learning_rate": 3.377469372935791e-06, + "loss": 0.80815154, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 2.8232767581939697 + }, + { + "auxiliary_loss_clip": 0.01502099, + "auxiliary_loss_mlp": 0.01283951, + "balance_loss_clip": 1.14790809, + "balance_loss_mlp": 1.0295105, + "epoch": 0.28047497369607693, + "flos": 14796074478240.0, + "grad_norm": 2.151246340590624, + "language_loss": 0.80253363, + "learning_rate": 3.377186981855578e-06, + "loss": 0.83039409, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.7261455059051514 + }, + { + "auxiliary_loss_clip": 0.01499691, + "auxiliary_loss_mlp": 0.0130309, + "balance_loss_clip": 1.14593911, + "balance_loss_mlp": 1.05074763, + "epoch": 0.2805350969487449, + "flos": 23072688541440.0, + "grad_norm": 1.7900590193851567, + "language_loss": 0.80652273, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.8345505, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 2.801523208618164 + }, + { + "auxiliary_loss_clip": 0.01507046, + "auxiliary_loss_mlp": 0.01292333, + "balance_loss_clip": 1.15225053, + "balance_loss_mlp": 1.03941846, + "epoch": 0.2805952202014129, + "flos": 20481638352480.0, + "grad_norm": 1.9639050066170807, + "language_loss": 0.84886372, + "learning_rate": 3.376622043036658e-06, + "loss": 0.87685752, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.7515671253204346 + }, + { + "auxiliary_loss_clip": 0.01510321, + "auxiliary_loss_mlp": 0.01314254, + "balance_loss_clip": 1.15597224, + "balance_loss_mlp": 1.06114841, + "epoch": 0.2806553434540809, + "flos": 27419792097600.0, + "grad_norm": 11.49999112263401, + "language_loss": 0.79703057, + "learning_rate": 3.376339495319373e-06, + "loss": 0.82527632, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 4.405369997024536 + }, + { + "auxiliary_loss_clip": 0.01503305, + "auxiliary_loss_mlp": 0.01294356, + "balance_loss_clip": 1.14845037, + "balance_loss_mlp": 1.03819931, + "epoch": 0.28071546670674885, + "flos": 26507421100800.0, + "grad_norm": 1.4676885487713456, + "language_loss": 0.76429528, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.79227185, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.788794755935669 + }, + { + "auxiliary_loss_clip": 0.01502923, + "auxiliary_loss_mlp": 0.01289429, + "balance_loss_clip": 1.14966595, + "balance_loss_mlp": 1.03308153, + "epoch": 0.2807755899594168, + "flos": 20560795151040.0, + "grad_norm": 2.5364595386413864, + "language_loss": 0.78865033, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81657386, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 2.7439348697662354 + }, + { + "auxiliary_loss_clip": 0.01502536, + "auxiliary_loss_mlp": 0.01294695, + "balance_loss_clip": 1.14914799, + "balance_loss_mlp": 1.03872871, + "epoch": 0.2808357132120848, + "flos": 24315455018880.0, + "grad_norm": 2.189366807649118, + "language_loss": 0.7977618, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.82573414, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 2.7899277210235596 + }, + { + "auxiliary_loss_clip": 0.01517199, + "auxiliary_loss_mlp": 0.01303281, + "balance_loss_clip": 1.16309798, + "balance_loss_mlp": 1.04903114, + "epoch": 0.28089583646475275, + "flos": 26434825873920.0, + "grad_norm": 1.7554578197640984, + "language_loss": 0.75420368, + "learning_rate": 3.37520878264809e-06, + "loss": 0.78240848, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.891570806503296 + }, + { + "auxiliary_loss_clip": 0.01508582, + "auxiliary_loss_mlp": 0.01293582, + "balance_loss_clip": 1.15572083, + "balance_loss_mlp": 1.0353272, + "epoch": 0.2809559597174207, + "flos": 23114030600160.0, + "grad_norm": 11.172093847607641, + "language_loss": 0.75625181, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.78427339, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 2.8526759147644043 + }, + { + "auxiliary_loss_clip": 0.0150514, + "auxiliary_loss_mlp": 0.01297746, + "balance_loss_clip": 1.15207148, + "balance_loss_mlp": 1.04521298, + "epoch": 0.2810160829700887, + "flos": 20925629765280.0, + "grad_norm": 1.9480380396639456, + "language_loss": 0.72899073, + "learning_rate": 3.374643113381237e-06, + "loss": 0.75701958, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 2.876512289047241 + }, + { + "auxiliary_loss_clip": 0.01503537, + "auxiliary_loss_mlp": 0.0128968, + "balance_loss_clip": 1.15118718, + "balance_loss_mlp": 1.0360024, + "epoch": 0.28107620622275664, + "flos": 14357658504960.0, + "grad_norm": 2.2706522008762025, + "language_loss": 0.77694321, + "learning_rate": 3.374360200552541e-06, + "loss": 0.80487537, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 2.8412203788757324 + }, + { + "auxiliary_loss_clip": 0.01504955, + "auxiliary_loss_mlp": 0.01292079, + "balance_loss_clip": 1.15116382, + "balance_loss_mlp": 1.03668475, + "epoch": 0.2811363294754246, + "flos": 20920964601600.0, + "grad_norm": 3.344154824235365, + "language_loss": 0.7049315, + "learning_rate": 3.374077235607968e-06, + "loss": 0.73290181, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.8848094940185547 + }, + { + "auxiliary_loss_clip": 0.0151272, + "auxiliary_loss_mlp": 0.01305511, + "balance_loss_clip": 1.15976644, + "balance_loss_mlp": 1.0573647, + "epoch": 0.28119645272809257, + "flos": 20596941051840.0, + "grad_norm": 1.6979190818102072, + "language_loss": 0.70125949, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72944182, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 2.810688018798828 + }, + { + "auxiliary_loss_clip": 0.01504188, + "auxiliary_loss_mlp": 0.01292675, + "balance_loss_clip": 1.15186191, + "balance_loss_mlp": 1.03861618, + "epoch": 0.28125657598076054, + "flos": 25339753108800.0, + "grad_norm": 1.5637586431461015, + "language_loss": 0.6376245, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.66559315, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 2.849393606185913 + }, + { + "auxiliary_loss_clip": 0.01500834, + "auxiliary_loss_mlp": 0.01299683, + "balance_loss_clip": 1.14775276, + "balance_loss_mlp": 1.04524255, + "epoch": 0.2813166992334285, + "flos": 24829424615520.0, + "grad_norm": 1.5041278787287844, + "language_loss": 0.70213294, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.73013812, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.8817198276519775 + }, + { + "auxiliary_loss_clip": 0.01505059, + "auxiliary_loss_mlp": 0.01289399, + "balance_loss_clip": 1.15150893, + "balance_loss_mlp": 1.03610349, + "epoch": 0.2813768224860965, + "flos": 21762826420320.0, + "grad_norm": 1.9100433718868193, + "language_loss": 0.74760938, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.77555394, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.761965036392212 + }, + { + "auxiliary_loss_clip": 0.01506612, + "auxiliary_loss_mlp": 0.01292101, + "balance_loss_clip": 1.15363538, + "balance_loss_mlp": 1.03537178, + "epoch": 0.2814369457387645, + "flos": 24319134050400.0, + "grad_norm": 1.850993415162677, + "language_loss": 0.77328253, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.80126965, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 2.8085432052612305 + }, + { + "auxiliary_loss_clip": 0.01504424, + "auxiliary_loss_mlp": 0.01309301, + "balance_loss_clip": 1.15186739, + "balance_loss_mlp": 1.05467033, + "epoch": 0.28149706899143245, + "flos": 18517319272800.0, + "grad_norm": 2.2103459950352224, + "language_loss": 0.74172747, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76986468, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.753199338912964 + }, + { + "auxiliary_loss_clip": 0.0150895, + "auxiliary_loss_mlp": 0.01290847, + "balance_loss_clip": 1.15600526, + "balance_loss_mlp": 1.0383141, + "epoch": 0.2815571922441004, + "flos": 24865570516320.0, + "grad_norm": 1.465832605502857, + "language_loss": 0.80590296, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.83390093, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.823007583618164 + }, + { + "auxiliary_loss_clip": 0.01507877, + "auxiliary_loss_mlp": 0.01305836, + "balance_loss_clip": 1.15460455, + "balance_loss_mlp": 1.05139542, + "epoch": 0.2816173154967684, + "flos": 19903683221280.0, + "grad_norm": 1.7358740114019886, + "language_loss": 0.76566219, + "learning_rate": 3.371811641167852e-06, + "loss": 0.79379934, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 2.782553195953369 + }, + { + "auxiliary_loss_clip": 0.01492647, + "auxiliary_loss_mlp": 0.01290213, + "balance_loss_clip": 1.14054203, + "balance_loss_mlp": 1.03520048, + "epoch": 0.28167743874943635, + "flos": 17493134967360.0, + "grad_norm": 1.6903984375719352, + "language_loss": 0.76573467, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.79356325, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.8032071590423584 + }, + { + "auxiliary_loss_clip": 0.01504858, + "auxiliary_loss_mlp": 0.01291819, + "balance_loss_clip": 1.1541208, + "balance_loss_mlp": 1.0385232, + "epoch": 0.2817375620021043, + "flos": 25304517483840.0, + "grad_norm": 7.001574594998113, + "language_loss": 0.75668836, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78465515, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.891632080078125 + }, + { + "auxiliary_loss_clip": 0.01506132, + "auxiliary_loss_mlp": 0.012942, + "balance_loss_clip": 1.15361094, + "balance_loss_mlp": 1.03575397, + "epoch": 0.2817976852547723, + "flos": 18694976595840.0, + "grad_norm": 2.511748339547215, + "language_loss": 0.63341391, + "learning_rate": 3.370961184640025e-06, + "loss": 0.66141725, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.8479297161102295 + }, + { + "auxiliary_loss_clip": 0.01506523, + "auxiliary_loss_mlp": 0.01298993, + "balance_loss_clip": 1.15359974, + "balance_loss_mlp": 1.04512525, + "epoch": 0.28185780850744024, + "flos": 22744189468800.0, + "grad_norm": 2.138927390549434, + "language_loss": 0.76561868, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.79367375, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.823211193084717 + }, + { + "auxiliary_loss_clip": 0.01510196, + "auxiliary_loss_mlp": 0.01303534, + "balance_loss_clip": 1.15687203, + "balance_loss_mlp": 1.04966545, + "epoch": 0.2819179317601082, + "flos": 14935386067200.0, + "grad_norm": 2.276610804673697, + "language_loss": 0.78619659, + "learning_rate": 3.37039395366863e-06, + "loss": 0.81433392, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.698970317840576 + }, + { + "auxiliary_loss_clip": 0.01506667, + "auxiliary_loss_mlp": 0.01290273, + "balance_loss_clip": 1.15257585, + "balance_loss_mlp": 1.03583217, + "epoch": 0.2819780550127762, + "flos": 23147369817120.0, + "grad_norm": 1.6721588131939613, + "language_loss": 0.78059447, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80856389, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.873192071914673 + }, + { + "auxiliary_loss_clip": 0.0150061, + "auxiliary_loss_mlp": 0.01293322, + "balance_loss_clip": 1.14759731, + "balance_loss_mlp": 1.03907275, + "epoch": 0.28203817826544414, + "flos": 21618508314240.0, + "grad_norm": 1.9133036988112895, + "language_loss": 0.88262105, + "learning_rate": 3.369826514835332e-06, + "loss": 0.91056037, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.764326572418213 + }, + { + "auxiliary_loss_clip": 0.01493419, + "auxiliary_loss_mlp": 0.01291295, + "balance_loss_clip": 1.14145792, + "balance_loss_mlp": 1.03284919, + "epoch": 0.2820983015181121, + "flos": 24029208280800.0, + "grad_norm": 1.7868545418223967, + "language_loss": 0.82320666, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.85105377, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.830137252807617 + }, + { + "auxiliary_loss_clip": 0.01500191, + "auxiliary_loss_mlp": 0.0128931, + "balance_loss_clip": 1.14704478, + "balance_loss_mlp": 1.03410721, + "epoch": 0.2821584247707801, + "flos": 30010538861280.0, + "grad_norm": 1.6781995729434578, + "language_loss": 0.74912882, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.77702379, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.8743090629577637 + }, + { + "auxiliary_loss_clip": 0.0149996, + "auxiliary_loss_mlp": 0.01284787, + "balance_loss_clip": 1.14764524, + "balance_loss_mlp": 1.02805793, + "epoch": 0.2822185480234481, + "flos": 21398864153760.0, + "grad_norm": 1.6473287269045298, + "language_loss": 0.77843106, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.80627859, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 2.768425941467285 + }, + { + "auxiliary_loss_clip": 0.01500702, + "auxiliary_loss_mlp": 0.01287948, + "balance_loss_clip": 1.1476078, + "balance_loss_mlp": 1.03541493, + "epoch": 0.28227867127611606, + "flos": 27455217363360.0, + "grad_norm": 2.1782579296808184, + "language_loss": 0.66994798, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.69783443, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 2.8515713214874268 + }, + { + "auxiliary_loss_clip": 0.01514036, + "auxiliary_loss_mlp": 0.01306842, + "balance_loss_clip": 1.16101623, + "balance_loss_mlp": 1.05182958, + "epoch": 0.282338794528784, + "flos": 22595509624320.0, + "grad_norm": 3.041324034743412, + "language_loss": 0.7583462, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78655505, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.7989487648010254 + }, + { + "auxiliary_loss_clip": 0.01507391, + "auxiliary_loss_mlp": 0.01301445, + "balance_loss_clip": 1.15520442, + "balance_loss_mlp": 1.04719508, + "epoch": 0.282398917781452, + "flos": 42014162799360.0, + "grad_norm": 1.9987384035294251, + "language_loss": 0.62476987, + "learning_rate": 3.368122952024877e-06, + "loss": 0.65285826, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.989499807357788 + }, + { + "auxiliary_loss_clip": 0.01496241, + "auxiliary_loss_mlp": 0.01286021, + "balance_loss_clip": 1.14407241, + "balance_loss_mlp": 1.03348804, + "epoch": 0.28245904103411995, + "flos": 23227133466240.0, + "grad_norm": 1.520666713850031, + "language_loss": 0.73403955, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.76186216, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 4.523369073867798 + }, + { + "auxiliary_loss_clip": 0.01504295, + "auxiliary_loss_mlp": 0.01288648, + "balance_loss_clip": 1.15240264, + "balance_loss_mlp": 1.03630602, + "epoch": 0.2825191642867879, + "flos": 25376998926240.0, + "grad_norm": 3.349726913195192, + "language_loss": 0.75749898, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.78542846, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 2.809950113296509 + }, + { + "auxiliary_loss_clip": 0.01501369, + "auxiliary_loss_mlp": 0.01297918, + "balance_loss_clip": 1.14865017, + "balance_loss_mlp": 1.03985405, + "epoch": 0.2825792875394559, + "flos": 17238634463520.0, + "grad_norm": 2.826683100932075, + "language_loss": 0.81279731, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.84079015, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.7507622241973877 + }, + { + "auxiliary_loss_clip": 0.01506229, + "auxiliary_loss_mlp": 0.01296632, + "balance_loss_clip": 1.15369856, + "balance_loss_mlp": 1.04715037, + "epoch": 0.28263941079212385, + "flos": 26726382554400.0, + "grad_norm": 1.7699990080226338, + "language_loss": 0.81653911, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.84456766, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 2.824122667312622 + }, + { + "auxiliary_loss_clip": 0.01505351, + "auxiliary_loss_mlp": 0.0129394, + "balance_loss_clip": 1.15242648, + "balance_loss_mlp": 1.03969002, + "epoch": 0.2826995340447918, + "flos": 25923700889280.0, + "grad_norm": 3.0625470457348896, + "language_loss": 0.73424637, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.76223928, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 2.841294765472412 + }, + { + "auxiliary_loss_clip": 0.01511706, + "auxiliary_loss_mlp": 0.0129291, + "balance_loss_clip": 1.15748906, + "balance_loss_mlp": 1.03866005, + "epoch": 0.2827596572974598, + "flos": 22381744328640.0, + "grad_norm": 1.7942802159202507, + "language_loss": 0.78306097, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.8111071, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 2.787325143814087 + }, + { + "auxiliary_loss_clip": 0.01504906, + "auxiliary_loss_mlp": 0.01288628, + "balance_loss_clip": 1.15171504, + "balance_loss_mlp": 1.03533173, + "epoch": 0.28281978055012774, + "flos": 33549954235200.0, + "grad_norm": 1.5749304863173683, + "language_loss": 0.69483942, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.72277474, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 2.8635413646698 + }, + { + "auxiliary_loss_clip": 0.01512493, + "auxiliary_loss_mlp": 0.01297648, + "balance_loss_clip": 1.1604147, + "balance_loss_mlp": 1.04606819, + "epoch": 0.2828799038027957, + "flos": 23443136523360.0, + "grad_norm": 2.0182761949639123, + "language_loss": 0.70445484, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.73255622, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 2.830768346786499 + }, + { + "auxiliary_loss_clip": 0.01613935, + "auxiliary_loss_mlp": 0.01236565, + "balance_loss_clip": 1.26234698, + "balance_loss_mlp": 1.01912689, + "epoch": 0.2829400270554637, + "flos": 69879842717760.0, + "grad_norm": 0.7211290715177279, + "language_loss": 0.59221822, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.62072325, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 6.282805442810059 + }, + { + "auxiliary_loss_clip": 0.01508156, + "auxiliary_loss_mlp": 0.01288864, + "balance_loss_clip": 1.15471268, + "balance_loss_mlp": 1.03614044, + "epoch": 0.2830001503081317, + "flos": 24791268522240.0, + "grad_norm": 1.4022329399837097, + "language_loss": 0.82395911, + "learning_rate": 3.365279531475407e-06, + "loss": 0.85192931, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 4.361603260040283 + }, + { + "auxiliary_loss_clip": 0.01501409, + "auxiliary_loss_mlp": 0.01298656, + "balance_loss_clip": 1.14845872, + "balance_loss_mlp": 1.04211736, + "epoch": 0.28306027356079966, + "flos": 27671675558400.0, + "grad_norm": 2.1620133757364126, + "language_loss": 0.80449843, + "learning_rate": 3.36499490449902e-06, + "loss": 0.83249909, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.8211941719055176 + }, + { + "auxiliary_loss_clip": 0.01607209, + "auxiliary_loss_mlp": 0.01244949, + "balance_loss_clip": 1.25591719, + "balance_loss_mlp": 1.02598572, + "epoch": 0.2831203968134676, + "flos": 60533264695680.0, + "grad_norm": 0.8733843745782983, + "language_loss": 0.62726045, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.65578204, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 3.094569683074951 + }, + { + "auxiliary_loss_clip": 0.01505138, + "auxiliary_loss_mlp": 0.01296978, + "balance_loss_clip": 1.1532141, + "balance_loss_mlp": 1.04024887, + "epoch": 0.2831805200661356, + "flos": 22057000143840.0, + "grad_norm": 1.345978807496218, + "language_loss": 0.74068248, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.7687037, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 2.8182995319366455 + }, + { + "auxiliary_loss_clip": 0.01509235, + "auxiliary_loss_mlp": 0.01305627, + "balance_loss_clip": 1.15622532, + "balance_loss_mlp": 1.05271268, + "epoch": 0.28324064331880355, + "flos": 22602829759200.0, + "grad_norm": 1.8055017923040557, + "language_loss": 0.78948367, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81763232, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 2.815195322036743 + }, + { + "auxiliary_loss_clip": 0.01502883, + "auxiliary_loss_mlp": 0.01308115, + "balance_loss_clip": 1.14958739, + "balance_loss_mlp": 1.0542469, + "epoch": 0.2833007665714715, + "flos": 30406474931040.0, + "grad_norm": 2.130595518960505, + "language_loss": 0.71645737, + "learning_rate": 3.363855879093996e-06, + "loss": 0.74456728, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 2.8598790168762207 + }, + { + "auxiliary_loss_clip": 0.01504829, + "auxiliary_loss_mlp": 0.01309557, + "balance_loss_clip": 1.15299344, + "balance_loss_mlp": 1.05740571, + "epoch": 0.2833608898241395, + "flos": 23551725938400.0, + "grad_norm": 1.877611979580143, + "language_loss": 0.82103002, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84917384, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 2.7880759239196777 + }, + { + "auxiliary_loss_clip": 0.01512546, + "auxiliary_loss_mlp": 0.01289827, + "balance_loss_clip": 1.15934014, + "balance_loss_mlp": 1.03271675, + "epoch": 0.28342101307680745, + "flos": 20268821260800.0, + "grad_norm": 2.067546228948049, + "language_loss": 0.75585908, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.7838828, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 2.776136875152588 + }, + { + "auxiliary_loss_clip": 0.01503847, + "auxiliary_loss_mlp": 0.01306766, + "balance_loss_clip": 1.15149117, + "balance_loss_mlp": 1.05060923, + "epoch": 0.2834811363294754, + "flos": 30849707780640.0, + "grad_norm": 2.169095775278613, + "language_loss": 0.78255463, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.81066072, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.8765621185302734 + }, + { + "auxiliary_loss_clip": 0.01504612, + "auxiliary_loss_mlp": 0.01290258, + "balance_loss_clip": 1.15261781, + "balance_loss_mlp": 1.0350548, + "epoch": 0.2835412595821434, + "flos": 22713239725920.0, + "grad_norm": 1.8491042321359032, + "language_loss": 0.73970747, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.76765621, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 2.8389275074005127 + }, + { + "auxiliary_loss_clip": 0.01497668, + "auxiliary_loss_mlp": 0.01305891, + "balance_loss_clip": 1.14542282, + "balance_loss_mlp": 1.05068779, + "epoch": 0.28360138283481134, + "flos": 18079699790880.0, + "grad_norm": 2.313322956196078, + "language_loss": 0.74775112, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.77578664, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 2.732144355773926 + }, + { + "auxiliary_loss_clip": 0.01511642, + "auxiliary_loss_mlp": 0.01302815, + "balance_loss_clip": 1.1593926, + "balance_loss_mlp": 1.04837489, + "epoch": 0.2836615060874793, + "flos": 17856528311520.0, + "grad_norm": 1.5791918168136627, + "language_loss": 0.67457783, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.70272243, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 2.8366029262542725 + }, + { + "auxiliary_loss_clip": 0.01502037, + "auxiliary_loss_mlp": 0.01294856, + "balance_loss_clip": 1.149997, + "balance_loss_mlp": 1.03831768, + "epoch": 0.2837216293401473, + "flos": 25742743816320.0, + "grad_norm": 1.8826618609327843, + "language_loss": 0.72394234, + "learning_rate": 3.361860593925566e-06, + "loss": 0.75191128, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 2.814488410949707 + }, + { + "auxiliary_loss_clip": 0.01514675, + "auxiliary_loss_mlp": 0.01297661, + "balance_loss_clip": 1.16310453, + "balance_loss_mlp": 1.04932404, + "epoch": 0.2837817525928153, + "flos": 20925667693440.0, + "grad_norm": 1.9780926376516414, + "language_loss": 0.80466998, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.83279335, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.802767276763916 + }, + { + "auxiliary_loss_clip": 0.01503927, + "auxiliary_loss_mlp": 0.01296034, + "balance_loss_clip": 1.15270805, + "balance_loss_mlp": 1.03968668, + "epoch": 0.28384187584548326, + "flos": 18918716997600.0, + "grad_norm": 1.9744695517121666, + "language_loss": 0.79223317, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.82023275, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 2.771503210067749 + }, + { + "auxiliary_loss_clip": 0.01504196, + "auxiliary_loss_mlp": 0.0129741, + "balance_loss_clip": 1.15326345, + "balance_loss_mlp": 1.04716611, + "epoch": 0.2839019990981512, + "flos": 27346400379360.0, + "grad_norm": 2.06550552304113, + "language_loss": 0.82864761, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.85666358, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.8643057346343994 + }, + { + "auxiliary_loss_clip": 0.01511052, + "auxiliary_loss_mlp": 0.01294313, + "balance_loss_clip": 1.16012907, + "balance_loss_mlp": 1.04101753, + "epoch": 0.2839621223508192, + "flos": 18116300829600.0, + "grad_norm": 1.7341678188188152, + "language_loss": 0.70344985, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.73150349, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 2.8194258213043213 + }, + { + "auxiliary_loss_clip": 0.01511744, + "auxiliary_loss_mlp": 0.012882, + "balance_loss_clip": 1.16070795, + "balance_loss_mlp": 1.03013539, + "epoch": 0.28402224560348716, + "flos": 26360713520640.0, + "grad_norm": 1.5683312194836363, + "language_loss": 0.78723156, + "learning_rate": 3.360433840760998e-06, + "loss": 0.81523097, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.8409667015075684 + }, + { + "auxiliary_loss_clip": 0.01506296, + "auxiliary_loss_mlp": 0.01296162, + "balance_loss_clip": 1.1548233, + "balance_loss_mlp": 1.04191279, + "epoch": 0.2840823688561551, + "flos": 24063078492000.0, + "grad_norm": 2.1868586951853786, + "language_loss": 0.92731953, + "learning_rate": 3.36014833532143e-06, + "loss": 0.9553442, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.8153772354125977 + }, + { + "auxiliary_loss_clip": 0.01510171, + "auxiliary_loss_mlp": 0.01299102, + "balance_loss_clip": 1.15895414, + "balance_loss_mlp": 1.04351771, + "epoch": 0.2841424921088231, + "flos": 29462357700000.0, + "grad_norm": 1.6033252873427917, + "language_loss": 0.89038539, + "learning_rate": 3.3598627783049e-06, + "loss": 0.91847813, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.8275985717773438 + }, + { + "auxiliary_loss_clip": 0.01514945, + "auxiliary_loss_mlp": 0.01301171, + "balance_loss_clip": 1.16457272, + "balance_loss_mlp": 1.04768491, + "epoch": 0.28420261536149105, + "flos": 48103741441440.0, + "grad_norm": 1.9321419020543966, + "language_loss": 0.78763014, + "learning_rate": 3.359577169722238e-06, + "loss": 0.81579131, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.9736127853393555 + }, + { + "auxiliary_loss_clip": 0.01509603, + "auxiliary_loss_mlp": 0.01302921, + "balance_loss_clip": 1.15987992, + "balance_loss_mlp": 1.05210507, + "epoch": 0.284262738614159, + "flos": 25668517678560.0, + "grad_norm": 2.586221863164397, + "language_loss": 0.6650666, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.69319183, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.8383047580718994 + }, + { + "auxiliary_loss_clip": 0.01503335, + "auxiliary_loss_mlp": 0.01296401, + "balance_loss_clip": 1.15209413, + "balance_loss_mlp": 1.04157901, + "epoch": 0.284322861866827, + "flos": 19721019381120.0, + "grad_norm": 1.9355470564500332, + "language_loss": 0.76825523, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.79625261, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.8369650840759277 + }, + { + "auxiliary_loss_clip": 0.01514196, + "auxiliary_loss_mlp": 0.01295496, + "balance_loss_clip": 1.16393304, + "balance_loss_mlp": 1.04010165, + "epoch": 0.28438298511949495, + "flos": 23917546684800.0, + "grad_norm": 1.8643138116320312, + "language_loss": 0.66560125, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.69369811, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.819462776184082 + }, + { + "auxiliary_loss_clip": 0.01512968, + "auxiliary_loss_mlp": 0.01297172, + "balance_loss_clip": 1.16243362, + "balance_loss_mlp": 1.03987086, + "epoch": 0.2844431083721629, + "flos": 26069725762560.0, + "grad_norm": 1.8026631922975354, + "language_loss": 0.74672914, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.77483052, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.9194042682647705 + }, + { + "auxiliary_loss_clip": 0.01509792, + "auxiliary_loss_mlp": 0.01290392, + "balance_loss_clip": 1.15772116, + "balance_loss_mlp": 1.03709602, + "epoch": 0.2845032316248309, + "flos": 25812722000160.0, + "grad_norm": 1.457468957013938, + "language_loss": 0.83896995, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.86697173, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 2.805149555206299 + }, + { + "auxiliary_loss_clip": 0.01510052, + "auxiliary_loss_mlp": 0.01297778, + "balance_loss_clip": 1.15929103, + "balance_loss_mlp": 1.04123998, + "epoch": 0.2845633548774989, + "flos": 19824943632480.0, + "grad_norm": 1.5592761918090146, + "language_loss": 0.78959179, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81767011, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.704943895339966 + }, + { + "auxiliary_loss_clip": 0.01519906, + "auxiliary_loss_mlp": 0.01322696, + "balance_loss_clip": 1.16797626, + "balance_loss_mlp": 1.06940043, + "epoch": 0.28462347813016686, + "flos": 23184919059840.0, + "grad_norm": 2.5238211658352236, + "language_loss": 0.71769041, + "learning_rate": 3.357576466701875e-06, + "loss": 0.7461164, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 2.752978563308716 + }, + { + "auxiliary_loss_clip": 0.01510479, + "auxiliary_loss_mlp": 0.01297284, + "balance_loss_clip": 1.15963829, + "balance_loss_mlp": 1.04742169, + "epoch": 0.2846836013828348, + "flos": 18662433870240.0, + "grad_norm": 1.9203792630175023, + "language_loss": 0.74478054, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.7728582, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.767279863357544 + }, + { + "auxiliary_loss_clip": 0.01514861, + "auxiliary_loss_mlp": 0.0129934, + "balance_loss_clip": 1.1634109, + "balance_loss_mlp": 1.04718864, + "epoch": 0.2847437246355028, + "flos": 14175260161920.0, + "grad_norm": 2.0687959533342215, + "language_loss": 0.80105877, + "learning_rate": 3.357004373789946e-06, + "loss": 0.82920086, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 4.327853202819824 + }, + { + "auxiliary_loss_clip": 0.0151611, + "auxiliary_loss_mlp": 0.01308483, + "balance_loss_clip": 1.1650449, + "balance_loss_mlp": 1.05709457, + "epoch": 0.28480384788817076, + "flos": 29280945489120.0, + "grad_norm": 2.2944139105599226, + "language_loss": 0.60060763, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.62885362, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 2.8313443660736084 + }, + { + "auxiliary_loss_clip": 0.0151539, + "auxiliary_loss_mlp": 0.01306957, + "balance_loss_clip": 1.16438043, + "balance_loss_mlp": 1.06109965, + "epoch": 0.2848639711408387, + "flos": 22603512466080.0, + "grad_norm": 1.9766651421036892, + "language_loss": 0.86797154, + "learning_rate": 3.356432075047052e-06, + "loss": 0.89619505, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 2.7523415088653564 + }, + { + "auxiliary_loss_clip": 0.01518724, + "auxiliary_loss_mlp": 0.01312003, + "balance_loss_clip": 1.16653264, + "balance_loss_mlp": 1.06156802, + "epoch": 0.2849240943935067, + "flos": 17601193388160.0, + "grad_norm": 2.0238750398702035, + "language_loss": 0.90016627, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92847353, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.740307092666626 + }, + { + "auxiliary_loss_clip": 0.01510514, + "auxiliary_loss_mlp": 0.01296822, + "balance_loss_clip": 1.15885639, + "balance_loss_mlp": 1.04772186, + "epoch": 0.28498421764617465, + "flos": 24864849881280.0, + "grad_norm": 1.325412863211691, + "language_loss": 0.72236991, + "learning_rate": 3.355859570559998e-06, + "loss": 0.75044328, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 2.8710498809814453 + }, + { + "auxiliary_loss_clip": 0.01523873, + "auxiliary_loss_mlp": 0.01310878, + "balance_loss_clip": 1.17342806, + "balance_loss_mlp": 1.06654704, + "epoch": 0.2850443408988426, + "flos": 22784697108000.0, + "grad_norm": 1.575005473246099, + "language_loss": 0.78047442, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80882192, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.7542967796325684 + }, + { + "auxiliary_loss_clip": 0.0151094, + "auxiliary_loss_mlp": 0.0130169, + "balance_loss_clip": 1.16014314, + "balance_loss_mlp": 1.05106401, + "epoch": 0.2851044641515106, + "flos": 18846083842560.0, + "grad_norm": 1.7088080649019024, + "language_loss": 0.76675713, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.79488343, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 2.830875873565674 + }, + { + "auxiliary_loss_clip": 0.01511406, + "auxiliary_loss_mlp": 0.01294749, + "balance_loss_clip": 1.16026306, + "balance_loss_mlp": 1.04068995, + "epoch": 0.28516458740417855, + "flos": 18882533168640.0, + "grad_norm": 2.034655675477999, + "language_loss": 0.58135504, + "learning_rate": 3.355000428249086e-06, + "loss": 0.6094166, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 2.792065143585205 + }, + { + "auxiliary_loss_clip": 0.01518356, + "auxiliary_loss_mlp": 0.01306724, + "balance_loss_clip": 1.16792989, + "balance_loss_mlp": 1.05228388, + "epoch": 0.2852247106568465, + "flos": 25302090081600.0, + "grad_norm": 1.6304603396510817, + "language_loss": 0.74633634, + "learning_rate": 3.354713944700797e-06, + "loss": 0.77458715, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 5.784803628921509 + }, + { + "auxiliary_loss_clip": 0.01515599, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_clip": 1.16550171, + "balance_loss_mlp": 1.05250633, + "epoch": 0.2852848339095145, + "flos": 11656956912480.0, + "grad_norm": 2.2190657342496705, + "language_loss": 0.7821545, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.81035519, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 4.369406223297119 + }, + { + "auxiliary_loss_clip": 0.01516808, + "auxiliary_loss_mlp": 0.01320407, + "balance_loss_clip": 1.16687393, + "balance_loss_mlp": 1.07378697, + "epoch": 0.2853449571621825, + "flos": 12934921086720.0, + "grad_norm": 2.1236480065698413, + "language_loss": 0.82957184, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85794401, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 2.7637839317321777 + }, + { + "auxiliary_loss_clip": 0.01520192, + "auxiliary_loss_mlp": 0.01294735, + "balance_loss_clip": 1.17050481, + "balance_loss_mlp": 1.04067612, + "epoch": 0.28540508041485046, + "flos": 20012613989760.0, + "grad_norm": 3.234880384605601, + "language_loss": 0.79997939, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.82812864, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 2.748046398162842 + }, + { + "auxiliary_loss_clip": 0.01626239, + "auxiliary_loss_mlp": 0.0130661, + "balance_loss_clip": 1.27809823, + "balance_loss_mlp": 1.09069824, + "epoch": 0.28546520366751843, + "flos": 68146356605760.0, + "grad_norm": 0.7919065439976376, + "language_loss": 0.60376495, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.63309348, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.2885541915893555 + }, + { + "auxiliary_loss_clip": 0.01514478, + "auxiliary_loss_mlp": 0.01306487, + "balance_loss_clip": 1.16303754, + "balance_loss_mlp": 1.05872273, + "epoch": 0.2855253269201864, + "flos": 13252686490080.0, + "grad_norm": 2.5550091333051443, + "language_loss": 0.80911589, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.83732557, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 2.750103235244751 + }, + { + "auxiliary_loss_clip": 0.01516023, + "auxiliary_loss_mlp": 0.01295278, + "balance_loss_clip": 1.16528583, + "balance_loss_mlp": 1.04541552, + "epoch": 0.28558545017285436, + "flos": 28623378421440.0, + "grad_norm": 2.267227688988244, + "language_loss": 0.71145457, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.73956758, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.7671165466308594 + }, + { + "auxiliary_loss_clip": 0.01522625, + "auxiliary_loss_mlp": 0.01290581, + "balance_loss_clip": 1.17224002, + "balance_loss_mlp": 1.03728533, + "epoch": 0.2856455734255223, + "flos": 34133295165120.0, + "grad_norm": 1.5846438472444693, + "language_loss": 0.82520747, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.85333949, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 2.8924407958984375 + }, + { + "auxiliary_loss_clip": 0.01508136, + "auxiliary_loss_mlp": 0.01299012, + "balance_loss_clip": 1.1570406, + "balance_loss_mlp": 1.04838598, + "epoch": 0.2857056966781903, + "flos": 39789312638400.0, + "grad_norm": 1.673306688011464, + "language_loss": 0.79801536, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82608688, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 2.9066576957702637 + }, + { + "auxiliary_loss_clip": 0.0150975, + "auxiliary_loss_mlp": 0.01289807, + "balance_loss_clip": 1.15883422, + "balance_loss_mlp": 1.035748, + "epoch": 0.28576581993085826, + "flos": 21874488016320.0, + "grad_norm": 1.7133549612956995, + "language_loss": 0.79149711, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.8194927, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 2.8979852199554443 + }, + { + "auxiliary_loss_clip": 0.0152093, + "auxiliary_loss_mlp": 0.01307393, + "balance_loss_clip": 1.1717062, + "balance_loss_mlp": 1.05104518, + "epoch": 0.2858259431835262, + "flos": 19093264211520.0, + "grad_norm": 2.107833317160392, + "language_loss": 0.89430338, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.92258656, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.820178747177124 + }, + { + "auxiliary_loss_clip": 0.01518209, + "auxiliary_loss_mlp": 0.01299955, + "balance_loss_clip": 1.16851294, + "balance_loss_mlp": 1.04894757, + "epoch": 0.2858860664361942, + "flos": 20336106545280.0, + "grad_norm": 2.400637678392718, + "language_loss": 0.82051283, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.8486945, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.7366464138031006 + }, + { + "auxiliary_loss_clip": 0.01511351, + "auxiliary_loss_mlp": 0.01282923, + "balance_loss_clip": 1.16311908, + "balance_loss_mlp": 1.02867353, + "epoch": 0.28594618968886215, + "flos": 24464021078880.0, + "grad_norm": 1.817313969925868, + "language_loss": 0.83898902, + "learning_rate": 3.351272138300922e-06, + "loss": 0.8669318, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 2.8027756214141846 + }, + { + "auxiliary_loss_clip": 0.01644, + "auxiliary_loss_mlp": 0.01241921, + "balance_loss_clip": 1.30529737, + "balance_loss_mlp": 1.01303864, + "epoch": 0.2860063129415301, + "flos": 71660663173440.0, + "grad_norm": 0.8780534062326464, + "language_loss": 0.60998464, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63884389, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 3.436375379562378 + }, + { + "auxiliary_loss_clip": 0.01532684, + "auxiliary_loss_mlp": 0.01301986, + "balance_loss_clip": 1.18930972, + "balance_loss_mlp": 1.05174184, + "epoch": 0.2860664361941981, + "flos": 20560795151040.0, + "grad_norm": 2.192382512743561, + "language_loss": 0.65802801, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.68637478, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 2.774418592453003 + }, + { + "auxiliary_loss_clip": 0.01523593, + "auxiliary_loss_mlp": 0.01297946, + "balance_loss_clip": 1.18113768, + "balance_loss_mlp": 1.04121649, + "epoch": 0.2861265594468661, + "flos": 35999606786400.0, + "grad_norm": 1.5451245886273732, + "language_loss": 0.63003278, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65824813, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.9840075969696045 + }, + { + "auxiliary_loss_clip": 0.01530875, + "auxiliary_loss_mlp": 0.01305865, + "balance_loss_clip": 1.18817091, + "balance_loss_mlp": 1.05447626, + "epoch": 0.28618668269953407, + "flos": 20049480525600.0, + "grad_norm": 1.9934598604139409, + "language_loss": 0.74318433, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.77155173, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.7280795574188232 + }, + { + "auxiliary_loss_clip": 0.01529763, + "auxiliary_loss_mlp": 0.01303668, + "balance_loss_clip": 1.18698859, + "balance_loss_mlp": 1.05609441, + "epoch": 0.28624680595220203, + "flos": 24974122003200.0, + "grad_norm": 2.027318062938516, + "language_loss": 0.73140979, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.75974417, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 2.8144688606262207 + }, + { + "auxiliary_loss_clip": 0.0152456, + "auxiliary_loss_mlp": 0.01297369, + "balance_loss_clip": 1.18222308, + "balance_loss_mlp": 1.04521787, + "epoch": 0.28630692920487, + "flos": 22494467913120.0, + "grad_norm": 1.9250090714156123, + "language_loss": 0.74479508, + "learning_rate": 3.349548466945793e-06, + "loss": 0.77301443, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.743346929550171 + }, + { + "auxiliary_loss_clip": 0.01529701, + "auxiliary_loss_mlp": 0.01323298, + "balance_loss_clip": 1.18785203, + "balance_loss_mlp": 1.07877612, + "epoch": 0.28636705245753796, + "flos": 21251929004640.0, + "grad_norm": 2.1570531211324804, + "language_loss": 0.76468694, + "learning_rate": 3.349261009210496e-06, + "loss": 0.79321694, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 2.7424423694610596 + }, + { + "auxiliary_loss_clip": 0.01531091, + "auxiliary_loss_mlp": 0.01306186, + "balance_loss_clip": 1.18830705, + "balance_loss_mlp": 1.0549885, + "epoch": 0.28642717571020593, + "flos": 24097972763520.0, + "grad_norm": 1.6391878650130807, + "language_loss": 0.77255642, + "learning_rate": 3.348973500311086e-06, + "loss": 0.80092919, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.8008873462677 + }, + { + "auxiliary_loss_clip": 0.01530466, + "auxiliary_loss_mlp": 0.0131053, + "balance_loss_clip": 1.18834698, + "balance_loss_mlp": 1.05818796, + "epoch": 0.2864872989628739, + "flos": 22603588322400.0, + "grad_norm": 3.5726436418378493, + "language_loss": 0.71596438, + "learning_rate": 3.348685940258466e-06, + "loss": 0.74437433, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.795114278793335 + }, + { + "auxiliary_loss_clip": 0.01523967, + "auxiliary_loss_mlp": 0.01301752, + "balance_loss_clip": 1.18142831, + "balance_loss_mlp": 1.05131686, + "epoch": 0.28654742221554186, + "flos": 32747879420640.0, + "grad_norm": 1.814927118016511, + "language_loss": 0.76274168, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.79099882, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 2.860692024230957 + }, + { + "auxiliary_loss_clip": 0.01522841, + "auxiliary_loss_mlp": 0.01294724, + "balance_loss_clip": 1.17951465, + "balance_loss_mlp": 1.04371643, + "epoch": 0.2866075454682098, + "flos": 26984182808160.0, + "grad_norm": 2.173101934044068, + "language_loss": 0.78034079, + "learning_rate": 3.348110666737214e-06, + "loss": 0.8085165, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.8103766441345215 + }, + { + "auxiliary_loss_clip": 0.01532915, + "auxiliary_loss_mlp": 0.01293995, + "balance_loss_clip": 1.190593, + "balance_loss_mlp": 1.04069948, + "epoch": 0.2866676687208778, + "flos": 23255845447680.0, + "grad_norm": 3.0597572348793216, + "language_loss": 0.65372825, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.6819973, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.8846943378448486 + }, + { + "auxiliary_loss_clip": 0.0152306, + "auxiliary_loss_mlp": 0.01297611, + "balance_loss_clip": 1.18061399, + "balance_loss_mlp": 1.04107237, + "epoch": 0.28672779197354575, + "flos": 21581869347360.0, + "grad_norm": 2.095578965611484, + "language_loss": 0.70598006, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.73418677, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 2.807377338409424 + }, + { + "auxiliary_loss_clip": 0.01526834, + "auxiliary_loss_mlp": 0.01297231, + "balance_loss_clip": 1.18371201, + "balance_loss_mlp": 1.04641509, + "epoch": 0.2867879152262137, + "flos": 19867651104960.0, + "grad_norm": 1.8140288994404994, + "language_loss": 0.74788392, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.7761246, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.7848451137542725 + }, + { + "auxiliary_loss_clip": 0.01519123, + "auxiliary_loss_mlp": 0.01300621, + "balance_loss_clip": 1.17648673, + "balance_loss_mlp": 1.04827929, + "epoch": 0.2868480384788817, + "flos": 28215457053120.0, + "grad_norm": 2.503452959493101, + "language_loss": 0.6793105, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.70750797, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.863713502883911 + }, + { + "auxiliary_loss_clip": 0.01620567, + "auxiliary_loss_mlp": 0.01244209, + "balance_loss_clip": 1.28289914, + "balance_loss_mlp": 1.01837921, + "epoch": 0.2869081617315497, + "flos": 65430294108480.0, + "grad_norm": 0.7717672888412848, + "language_loss": 0.56818408, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.5968318, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.265115261077881 + }, + { + "auxiliary_loss_clip": 0.01519551, + "auxiliary_loss_mlp": 0.01291381, + "balance_loss_clip": 1.17749107, + "balance_loss_mlp": 1.03465223, + "epoch": 0.28696828498421767, + "flos": 18662699367360.0, + "grad_norm": 2.996880259008165, + "language_loss": 0.83634502, + "learning_rate": 3.346383619630856e-06, + "loss": 0.86445433, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 2.6818337440490723 + }, + { + "auxiliary_loss_clip": 0.01505419, + "auxiliary_loss_mlp": 0.01296621, + "balance_loss_clip": 1.16338503, + "balance_loss_mlp": 1.04179883, + "epoch": 0.28702840823688563, + "flos": 23662022120640.0, + "grad_norm": 3.0837387092080553, + "language_loss": 0.77793384, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80595422, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 4.422378063201904 + }, + { + "auxiliary_loss_clip": 0.01510118, + "auxiliary_loss_mlp": 0.0128315, + "balance_loss_clip": 1.16647887, + "balance_loss_mlp": 1.03080797, + "epoch": 0.2870885314895536, + "flos": 13806253450080.0, + "grad_norm": 2.190425689237311, + "language_loss": 0.73712277, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.76505542, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 2.797447443008423 + }, + { + "auxiliary_loss_clip": 0.0150557, + "auxiliary_loss_mlp": 0.01288893, + "balance_loss_clip": 1.16298461, + "balance_loss_mlp": 1.03140116, + "epoch": 0.28714865474222157, + "flos": 17788787889120.0, + "grad_norm": 2.3739842382019587, + "language_loss": 0.88238752, + "learning_rate": 3.34551940668778e-06, + "loss": 0.91033208, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 2.7394115924835205 + }, + { + "auxiliary_loss_clip": 0.01507843, + "auxiliary_loss_mlp": 0.01299426, + "balance_loss_clip": 1.16426015, + "balance_loss_mlp": 1.05051684, + "epoch": 0.28720877799488953, + "flos": 15999319448640.0, + "grad_norm": 1.6918679866310962, + "language_loss": 0.74214995, + "learning_rate": 3.345231233647726e-06, + "loss": 0.77022266, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.744992971420288 + }, + { + "auxiliary_loss_clip": 0.01519568, + "auxiliary_loss_mlp": 0.01292275, + "balance_loss_clip": 1.17631316, + "balance_loss_mlp": 1.03440166, + "epoch": 0.2872689012475575, + "flos": 20925250483680.0, + "grad_norm": 1.8950558476164032, + "language_loss": 0.79836488, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82648337, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 2.8606512546539307 + }, + { + "auxiliary_loss_clip": 0.01514971, + "auxiliary_loss_mlp": 0.01294465, + "balance_loss_clip": 1.1706903, + "balance_loss_mlp": 1.04059649, + "epoch": 0.28732902450022546, + "flos": 21327292987200.0, + "grad_norm": 1.841343351480598, + "language_loss": 0.73706686, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76516122, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 2.759883403778076 + }, + { + "auxiliary_loss_clip": 0.0151565, + "auxiliary_loss_mlp": 0.01282405, + "balance_loss_clip": 1.17029321, + "balance_loss_mlp": 1.02319598, + "epoch": 0.2873891477528934, + "flos": 20852503544160.0, + "grad_norm": 1.7930662547991183, + "language_loss": 0.7645424, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.79252297, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 2.7539219856262207 + }, + { + "auxiliary_loss_clip": 0.01506279, + "auxiliary_loss_mlp": 0.01282242, + "balance_loss_clip": 1.16095328, + "balance_loss_mlp": 1.02398682, + "epoch": 0.2874492710055614, + "flos": 17421943082400.0, + "grad_norm": 2.2027929919138716, + "language_loss": 0.81094253, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83882779, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.741426706314087 + }, + { + "auxiliary_loss_clip": 0.01513044, + "auxiliary_loss_mlp": 0.01291741, + "balance_loss_clip": 1.16834259, + "balance_loss_mlp": 1.02719152, + "epoch": 0.28750939425822936, + "flos": 13408496828640.0, + "grad_norm": 2.130710064221746, + "language_loss": 0.86214906, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.89019686, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 4.253015995025635 + }, + { + "auxiliary_loss_clip": 0.01515984, + "auxiliary_loss_mlp": 0.01293897, + "balance_loss_clip": 1.17005706, + "balance_loss_mlp": 1.0352608, + "epoch": 0.2875695175108973, + "flos": 21872098542240.0, + "grad_norm": 1.5050340649295018, + "language_loss": 0.71527457, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.74337327, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.8497462272644043 + }, + { + "auxiliary_loss_clip": 0.01507621, + "auxiliary_loss_mlp": 0.01290694, + "balance_loss_clip": 1.16248667, + "balance_loss_mlp": 1.03415561, + "epoch": 0.2876296407635653, + "flos": 26247610654560.0, + "grad_norm": 1.6835576077540986, + "language_loss": 0.77114499, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79912817, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 4.295501232147217 + }, + { + "auxiliary_loss_clip": 0.01508927, + "auxiliary_loss_mlp": 0.01287434, + "balance_loss_clip": 1.16075623, + "balance_loss_mlp": 1.03718948, + "epoch": 0.28768976401623325, + "flos": 25375974865920.0, + "grad_norm": 1.3789150763438247, + "language_loss": 0.76031411, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.78827763, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 2.811110258102417 + }, + { + "auxiliary_loss_clip": 0.01504751, + "auxiliary_loss_mlp": 0.01297778, + "balance_loss_clip": 1.15724552, + "balance_loss_mlp": 1.04104841, + "epoch": 0.28774988726890127, + "flos": 30667006012320.0, + "grad_norm": 2.0044725419993834, + "language_loss": 0.82793736, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85596263, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 2.8310041427612305 + }, + { + "auxiliary_loss_clip": 0.01502242, + "auxiliary_loss_mlp": 0.0128378, + "balance_loss_clip": 1.15558159, + "balance_loss_mlp": 1.02857661, + "epoch": 0.28781001052156924, + "flos": 20597585830560.0, + "grad_norm": 1.9966304315835324, + "language_loss": 0.80155593, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82941622, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.751431941986084 + }, + { + "auxiliary_loss_clip": 0.01505738, + "auxiliary_loss_mlp": 0.01286257, + "balance_loss_clip": 1.15890825, + "balance_loss_mlp": 1.03334236, + "epoch": 0.2878701337742372, + "flos": 26544780702720.0, + "grad_norm": 2.1769367868001193, + "language_loss": 0.83695424, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.86487418, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.814467191696167 + }, + { + "auxiliary_loss_clip": 0.01513505, + "auxiliary_loss_mlp": 0.01300357, + "balance_loss_clip": 1.16726041, + "balance_loss_mlp": 1.04419982, + "epoch": 0.28793025702690517, + "flos": 28149347541600.0, + "grad_norm": 4.3317901034456385, + "language_loss": 0.73853892, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.7666775, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 2.8028080463409424 + }, + { + "auxiliary_loss_clip": 0.01499522, + "auxiliary_loss_mlp": 0.01288018, + "balance_loss_clip": 1.15189052, + "balance_loss_mlp": 1.03853655, + "epoch": 0.28799038027957313, + "flos": 23807743568640.0, + "grad_norm": 1.9230364069621488, + "language_loss": 0.84349197, + "learning_rate": 3.341480346078704e-06, + "loss": 0.87136734, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.7859513759613037 + }, + { + "auxiliary_loss_clip": 0.01505097, + "auxiliary_loss_mlp": 0.01283447, + "balance_loss_clip": 1.15872002, + "balance_loss_mlp": 1.02614594, + "epoch": 0.2880505035322411, + "flos": 22346205278400.0, + "grad_norm": 1.7558832091413779, + "language_loss": 0.77949822, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80738366, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 2.7289645671844482 + }, + { + "auxiliary_loss_clip": 0.01499192, + "auxiliary_loss_mlp": 0.01296436, + "balance_loss_clip": 1.15221477, + "balance_loss_mlp": 1.04104197, + "epoch": 0.28811062678490906, + "flos": 18006194288160.0, + "grad_norm": 11.87648709041628, + "language_loss": 0.70558518, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.73354149, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.753610372543335 + }, + { + "auxiliary_loss_clip": 0.01503869, + "auxiliary_loss_mlp": 0.01290329, + "balance_loss_clip": 1.15842915, + "balance_loss_mlp": 1.03417206, + "epoch": 0.28817075003757703, + "flos": 22093070188320.0, + "grad_norm": 1.6374997389862203, + "language_loss": 0.79437089, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.82231289, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 2.77634334564209 + }, + { + "auxiliary_loss_clip": 0.01508921, + "auxiliary_loss_mlp": 0.01295016, + "balance_loss_clip": 1.16226363, + "balance_loss_mlp": 1.04706037, + "epoch": 0.288230873290245, + "flos": 41686384361760.0, + "grad_norm": 1.9508219946134444, + "language_loss": 0.78427553, + "learning_rate": 3.340324496161797e-06, + "loss": 0.81231493, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.9073727130889893 + }, + { + "auxiliary_loss_clip": 0.01496139, + "auxiliary_loss_mlp": 0.01294001, + "balance_loss_clip": 1.15035892, + "balance_loss_mlp": 1.03650916, + "epoch": 0.28829099654291296, + "flos": 18626401753920.0, + "grad_norm": 2.126145971142333, + "language_loss": 0.83549839, + "learning_rate": 3.340035406592074e-06, + "loss": 0.86339974, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.757230758666992 + }, + { + "auxiliary_loss_clip": 0.01502936, + "auxiliary_loss_mlp": 0.01280912, + "balance_loss_clip": 1.15639043, + "balance_loss_mlp": 1.02895129, + "epoch": 0.2883511197955809, + "flos": 24676800242400.0, + "grad_norm": 2.0935348238123757, + "language_loss": 0.7460596, + "learning_rate": 3.339746266208074e-06, + "loss": 0.77389807, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 2.779419183731079 + }, + { + "auxiliary_loss_clip": 0.0150471, + "auxiliary_loss_mlp": 0.01299868, + "balance_loss_clip": 1.15780187, + "balance_loss_mlp": 1.0454278, + "epoch": 0.2884112430482489, + "flos": 23114371953600.0, + "grad_norm": 3.0779322199487367, + "language_loss": 0.73283434, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.76088023, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 2.8184328079223633 + }, + { + "auxiliary_loss_clip": 0.01503855, + "auxiliary_loss_mlp": 0.01303165, + "balance_loss_clip": 1.15810323, + "balance_loss_mlp": 1.05253911, + "epoch": 0.28847136630091685, + "flos": 16875241119360.0, + "grad_norm": 2.1395830865407146, + "language_loss": 0.74600464, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.77407479, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 2.7151594161987305 + }, + { + "auxiliary_loss_clip": 0.0150442, + "auxiliary_loss_mlp": 0.01321926, + "balance_loss_clip": 1.15834618, + "balance_loss_mlp": 1.06786692, + "epoch": 0.2885314895535849, + "flos": 25659414920160.0, + "grad_norm": 2.568343313556652, + "language_loss": 0.65089428, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67915773, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.8077685832977295 + }, + { + "auxiliary_loss_clip": 0.01499651, + "auxiliary_loss_mlp": 0.01299094, + "balance_loss_clip": 1.15474677, + "balance_loss_mlp": 1.04312754, + "epoch": 0.28859161280625284, + "flos": 21109469378400.0, + "grad_norm": 1.709868160436344, + "language_loss": 0.82550216, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.85348958, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.8559134006500244 + }, + { + "auxiliary_loss_clip": 0.01493729, + "auxiliary_loss_mlp": 0.01287976, + "balance_loss_clip": 1.14778221, + "balance_loss_mlp": 1.03696942, + "epoch": 0.2886517360589208, + "flos": 26471995835040.0, + "grad_norm": 1.6551740476894539, + "language_loss": 0.91192788, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93974495, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 2.812256097793579 + }, + { + "auxiliary_loss_clip": 0.01494607, + "auxiliary_loss_mlp": 0.01285099, + "balance_loss_clip": 1.14902449, + "balance_loss_mlp": 1.03008652, + "epoch": 0.28871185931158877, + "flos": 25267650948000.0, + "grad_norm": 2.1491526319497662, + "language_loss": 0.73681515, + "learning_rate": 3.33801035741839e-06, + "loss": 0.76461232, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 2.812710762023926 + }, + { + "auxiliary_loss_clip": 0.01595949, + "auxiliary_loss_mlp": 0.01231155, + "balance_loss_clip": 1.25566769, + "balance_loss_mlp": 1.00151062, + "epoch": 0.28877198256425674, + "flos": 66671543459520.0, + "grad_norm": 0.7801940246098416, + "language_loss": 0.62952644, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65779752, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.2722694873809814 + }, + { + "auxiliary_loss_clip": 0.01496859, + "auxiliary_loss_mlp": 0.01296851, + "balance_loss_clip": 1.15147364, + "balance_loss_mlp": 1.03878665, + "epoch": 0.2888321058169247, + "flos": 20305118874240.0, + "grad_norm": 1.8320921432370814, + "language_loss": 0.71009457, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.73803163, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 2.786343812942505 + }, + { + "auxiliary_loss_clip": 0.01498239, + "auxiliary_loss_mlp": 0.01299595, + "balance_loss_clip": 1.15303874, + "balance_loss_mlp": 1.04420125, + "epoch": 0.28889222906959267, + "flos": 25518889630080.0, + "grad_norm": 2.0539154272517934, + "language_loss": 0.68434918, + "learning_rate": 3.337141717919346e-06, + "loss": 0.71232748, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 2.7879180908203125 + }, + { + "auxiliary_loss_clip": 0.01495326, + "auxiliary_loss_mlp": 0.01289728, + "balance_loss_clip": 1.15050888, + "balance_loss_mlp": 1.03185427, + "epoch": 0.28895235232226063, + "flos": 32674070492640.0, + "grad_norm": 1.4384301256840735, + "language_loss": 0.69475734, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.72260791, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.8120875358581543 + }, + { + "auxiliary_loss_clip": 0.0149984, + "auxiliary_loss_mlp": 0.01297686, + "balance_loss_clip": 1.15465546, + "balance_loss_mlp": 1.04400826, + "epoch": 0.2890124755749286, + "flos": 29717199557280.0, + "grad_norm": 1.6173129844340817, + "language_loss": 0.71825814, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.74623346, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 2.803760051727295 + }, + { + "auxiliary_loss_clip": 0.01498357, + "auxiliary_loss_mlp": 0.01310183, + "balance_loss_clip": 1.1532824, + "balance_loss_mlp": 1.05402601, + "epoch": 0.28907259882759656, + "flos": 22676676615360.0, + "grad_norm": 1.9815978845299924, + "language_loss": 0.81819004, + "learning_rate": 3.336272622079382e-06, + "loss": 0.84627545, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.7801194190979004 + }, + { + "auxiliary_loss_clip": 0.01498954, + "auxiliary_loss_mlp": 0.01295378, + "balance_loss_clip": 1.15362716, + "balance_loss_mlp": 1.04150963, + "epoch": 0.2891327220802645, + "flos": 22568314769280.0, + "grad_norm": 1.672864833071771, + "language_loss": 0.78520632, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.81314963, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.778357982635498 + }, + { + "auxiliary_loss_clip": 0.01489436, + "auxiliary_loss_mlp": 0.01297213, + "balance_loss_clip": 1.14406359, + "balance_loss_mlp": 1.0385766, + "epoch": 0.2891928453329325, + "flos": 21654957640320.0, + "grad_norm": 1.9325403184717287, + "language_loss": 0.78606921, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.8139357, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.743640184402466 + }, + { + "auxiliary_loss_clip": 0.01496309, + "auxiliary_loss_mlp": 0.01291356, + "balance_loss_clip": 1.15036488, + "balance_loss_mlp": 1.03519905, + "epoch": 0.28925296858560046, + "flos": 23224364710560.0, + "grad_norm": 1.8766288137767175, + "language_loss": 0.77169734, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79957402, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 2.7674081325531006 + }, + { + "auxiliary_loss_clip": 0.01496199, + "auxiliary_loss_mlp": 0.01298242, + "balance_loss_clip": 1.15166652, + "balance_loss_mlp": 1.04494596, + "epoch": 0.2893130918382685, + "flos": 28624212840960.0, + "grad_norm": 1.5081078047017515, + "language_loss": 0.77652109, + "learning_rate": 3.335113118275117e-06, + "loss": 0.80446547, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 4.464073657989502 + }, + { + "auxiliary_loss_clip": 0.01591663, + "auxiliary_loss_mlp": 0.01233887, + "balance_loss_clip": 1.24939632, + "balance_loss_mlp": 1.00805664, + "epoch": 0.28937321509093644, + "flos": 72308672344800.0, + "grad_norm": 0.8254605464919521, + "language_loss": 0.60166436, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62991989, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 3.482016086578369 + }, + { + "auxiliary_loss_clip": 0.01494213, + "auxiliary_loss_mlp": 0.01304027, + "balance_loss_clip": 1.14840829, + "balance_loss_mlp": 1.04863322, + "epoch": 0.2894333383436044, + "flos": 16218242974080.0, + "grad_norm": 3.1339080793522087, + "language_loss": 0.82807642, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.85605878, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 2.7898173332214355 + }, + { + "auxiliary_loss_clip": 0.01490001, + "auxiliary_loss_mlp": 0.01300866, + "balance_loss_clip": 1.14497221, + "balance_loss_mlp": 1.04089475, + "epoch": 0.2894934615962724, + "flos": 24830676244800.0, + "grad_norm": 2.0817342197196895, + "language_loss": 0.72374487, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.75165355, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 2.7920947074890137 + }, + { + "auxiliary_loss_clip": 0.01501573, + "auxiliary_loss_mlp": 0.01294806, + "balance_loss_clip": 1.15482903, + "balance_loss_mlp": 1.04017484, + "epoch": 0.28955358484894034, + "flos": 20452129879680.0, + "grad_norm": 1.681386863868984, + "language_loss": 0.70576036, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.73372412, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 2.8172571659088135 + }, + { + "auxiliary_loss_clip": 0.01493249, + "auxiliary_loss_mlp": 0.01302405, + "balance_loss_clip": 1.14779806, + "balance_loss_mlp": 1.04376876, + "epoch": 0.2896137081016083, + "flos": 22567404493440.0, + "grad_norm": 2.450577862234556, + "language_loss": 0.74403524, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.77199173, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 2.7775752544403076 + }, + { + "auxiliary_loss_clip": 0.01492371, + "auxiliary_loss_mlp": 0.01295895, + "balance_loss_clip": 1.14643884, + "balance_loss_mlp": 1.03725886, + "epoch": 0.28967383135427627, + "flos": 26690388366240.0, + "grad_norm": 4.206268496689866, + "language_loss": 0.76581639, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.79369903, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.786625623703003 + }, + { + "auxiliary_loss_clip": 0.01501524, + "auxiliary_loss_mlp": 0.01295602, + "balance_loss_clip": 1.15634513, + "balance_loss_mlp": 1.04135203, + "epoch": 0.28973395460694423, + "flos": 15559993199520.0, + "grad_norm": 2.6670609204447313, + "language_loss": 0.79674125, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82471246, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.763584613800049 + }, + { + "auxiliary_loss_clip": 0.01497372, + "auxiliary_loss_mlp": 0.01306718, + "balance_loss_clip": 1.15129721, + "balance_loss_mlp": 1.04789054, + "epoch": 0.2897940778596122, + "flos": 18699452118720.0, + "grad_norm": 1.709905238044772, + "language_loss": 0.78853083, + "learning_rate": 3.332791681244776e-06, + "loss": 0.81657171, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 4.351288795471191 + }, + { + "auxiliary_loss_clip": 0.01497372, + "auxiliary_loss_mlp": 0.01298818, + "balance_loss_clip": 1.1510967, + "balance_loss_mlp": 1.04132581, + "epoch": 0.28985420111228016, + "flos": 18772274914560.0, + "grad_norm": 2.377257115287744, + "language_loss": 0.72989261, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75785458, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 4.230363607406616 + }, + { + "auxiliary_loss_clip": 0.01492513, + "auxiliary_loss_mlp": 0.01286843, + "balance_loss_clip": 1.14504933, + "balance_loss_mlp": 1.03183103, + "epoch": 0.28991432436494813, + "flos": 23071740337440.0, + "grad_norm": 1.9597284435658882, + "language_loss": 0.72358119, + "learning_rate": 3.332210816371104e-06, + "loss": 0.75137472, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 4.273548603057861 + }, + { + "auxiliary_loss_clip": 0.01498108, + "auxiliary_loss_mlp": 0.0129828, + "balance_loss_clip": 1.15241086, + "balance_loss_mlp": 1.04116976, + "epoch": 0.2899744476176161, + "flos": 17605137916800.0, + "grad_norm": 1.8655490978577962, + "language_loss": 0.66753578, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.69549966, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.8346590995788574 + }, + { + "auxiliary_loss_clip": 0.01493808, + "auxiliary_loss_mlp": 0.01299881, + "balance_loss_clip": 1.14799619, + "balance_loss_mlp": 1.04048121, + "epoch": 0.29003457087028406, + "flos": 22311538575840.0, + "grad_norm": 2.149266565729145, + "language_loss": 0.81593192, + "learning_rate": 3.331629749427164e-06, + "loss": 0.84386879, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 2.7789454460144043 + }, + { + "auxiliary_loss_clip": 0.01496007, + "auxiliary_loss_mlp": 0.01307101, + "balance_loss_clip": 1.14978576, + "balance_loss_mlp": 1.05151582, + "epoch": 0.2900946941229521, + "flos": 21947159099520.0, + "grad_norm": 2.1544485314723927, + "language_loss": 0.72654623, + "learning_rate": 3.331339140206385e-06, + "loss": 0.75457728, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 2.832275390625 + }, + { + "auxiliary_loss_clip": 0.01500312, + "auxiliary_loss_mlp": 0.01305297, + "balance_loss_clip": 1.15521455, + "balance_loss_mlp": 1.04837763, + "epoch": 0.29015481737562004, + "flos": 17934167983680.0, + "grad_norm": 2.1770329796305234, + "language_loss": 0.73503494, + "learning_rate": 3.331048480501092e-06, + "loss": 0.76309103, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.952604055404663 + }, + { + "auxiliary_loss_clip": 0.01492568, + "auxiliary_loss_mlp": 0.01291206, + "balance_loss_clip": 1.14818954, + "balance_loss_mlp": 1.03543019, + "epoch": 0.290214940628288, + "flos": 22785721168320.0, + "grad_norm": 2.0556382125058503, + "language_loss": 0.68284672, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.71068442, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.88936710357666 + }, + { + "auxiliary_loss_clip": 0.01499661, + "auxiliary_loss_mlp": 0.01303066, + "balance_loss_clip": 1.15456426, + "balance_loss_mlp": 1.04385746, + "epoch": 0.290275063880956, + "flos": 20008290179520.0, + "grad_norm": 8.095156891839506, + "language_loss": 0.80080807, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82883537, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 2.739290952682495 + }, + { + "auxiliary_loss_clip": 0.01498024, + "auxiliary_loss_mlp": 0.01290073, + "balance_loss_clip": 1.15289307, + "balance_loss_mlp": 1.03315353, + "epoch": 0.29033518713362394, + "flos": 22055748514560.0, + "grad_norm": 1.7787797949471336, + "language_loss": 0.80178362, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82966459, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.9312174320220947 + }, + { + "auxiliary_loss_clip": 0.01498512, + "auxiliary_loss_mlp": 0.01298272, + "balance_loss_clip": 1.15235281, + "balance_loss_mlp": 1.04535758, + "epoch": 0.2903953103862919, + "flos": 25632447634080.0, + "grad_norm": 1.6494317410191455, + "language_loss": 0.82722545, + "learning_rate": 3.329885337055249e-06, + "loss": 0.85519326, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 2.824112892150879 + }, + { + "auxiliary_loss_clip": 0.01502437, + "auxiliary_loss_mlp": 0.01302665, + "balance_loss_clip": 1.15830708, + "balance_loss_mlp": 1.04479098, + "epoch": 0.29045543363895987, + "flos": 16947722561760.0, + "grad_norm": 2.1924106892520436, + "language_loss": 0.79169273, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81974375, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 2.724641799926758 + }, + { + "auxiliary_loss_clip": 0.01500022, + "auxiliary_loss_mlp": 0.01294786, + "balance_loss_clip": 1.15575314, + "balance_loss_mlp": 1.0401547, + "epoch": 0.29051555689162784, + "flos": 26397883481760.0, + "grad_norm": 1.7085907224227945, + "language_loss": 0.74297422, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.77092224, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 2.781587600708008 + }, + { + "auxiliary_loss_clip": 0.01490966, + "auxiliary_loss_mlp": 0.01284883, + "balance_loss_clip": 1.14633107, + "balance_loss_mlp": 1.03234982, + "epoch": 0.2905756801442958, + "flos": 21105904131360.0, + "grad_norm": 1.8906295371103659, + "language_loss": 0.76086187, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78862035, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.7041049003601074 + }, + { + "auxiliary_loss_clip": 0.01489829, + "auxiliary_loss_mlp": 0.01290126, + "balance_loss_clip": 1.14477491, + "balance_loss_mlp": 1.03816485, + "epoch": 0.29063580339696377, + "flos": 15707914480800.0, + "grad_norm": 2.048943770901027, + "language_loss": 0.64889735, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.6766969, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.761704206466675 + }, + { + "auxiliary_loss_clip": 0.01495463, + "auxiliary_loss_mlp": 0.01286769, + "balance_loss_clip": 1.14989114, + "balance_loss_mlp": 1.03042102, + "epoch": 0.29069592664963173, + "flos": 24647329697760.0, + "grad_norm": 1.671221952973598, + "language_loss": 0.71650445, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.74432671, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.791471242904663 + }, + { + "auxiliary_loss_clip": 0.01489313, + "auxiliary_loss_mlp": 0.0128789, + "balance_loss_clip": 1.14433503, + "balance_loss_mlp": 1.03497553, + "epoch": 0.2907560499022997, + "flos": 24976132195680.0, + "grad_norm": 2.5890051690876437, + "language_loss": 0.7945987, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.82237071, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 2.7436442375183105 + }, + { + "auxiliary_loss_clip": 0.01502077, + "auxiliary_loss_mlp": 0.01304549, + "balance_loss_clip": 1.15766072, + "balance_loss_mlp": 1.04705739, + "epoch": 0.29081617315496766, + "flos": 18659285832960.0, + "grad_norm": 1.8214199356824545, + "language_loss": 0.81159836, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.83966458, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 2.7557079792022705 + }, + { + "auxiliary_loss_clip": 0.01491047, + "auxiliary_loss_mlp": 0.01289449, + "balance_loss_clip": 1.14620614, + "balance_loss_mlp": 1.03023982, + "epoch": 0.2908762964076356, + "flos": 35333733451680.0, + "grad_norm": 2.3255477142022083, + "language_loss": 0.67280793, + "learning_rate": 3.327556630259381e-06, + "loss": 0.7006129, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.8350131511688232 + }, + { + "auxiliary_loss_clip": 0.01495194, + "auxiliary_loss_mlp": 0.01290677, + "balance_loss_clip": 1.15150249, + "balance_loss_mlp": 1.03013349, + "epoch": 0.29093641966030365, + "flos": 23078567406240.0, + "grad_norm": 2.094331232900442, + "language_loss": 0.71641481, + "learning_rate": 3.327265315259095e-06, + "loss": 0.74427348, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.7626075744628906 + }, + { + "auxiliary_loss_clip": 0.01492472, + "auxiliary_loss_mlp": 0.01279828, + "balance_loss_clip": 1.15060306, + "balance_loss_mlp": 1.02252698, + "epoch": 0.2909965429129716, + "flos": 35958378512160.0, + "grad_norm": 1.9092429944853093, + "language_loss": 0.75941944, + "learning_rate": 3.326973949928776e-06, + "loss": 0.7871424, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 2.864666700363159 + }, + { + "auxiliary_loss_clip": 0.01498572, + "auxiliary_loss_mlp": 0.01300988, + "balance_loss_clip": 1.15562403, + "balance_loss_mlp": 1.04578519, + "epoch": 0.2910566661656396, + "flos": 30882705644160.0, + "grad_norm": 2.320950846573953, + "language_loss": 0.60735476, + "learning_rate": 3.326682534279471e-06, + "loss": 0.63535035, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.905100107192993 + }, + { + "auxiliary_loss_clip": 0.01492923, + "auxiliary_loss_mlp": 0.01292742, + "balance_loss_clip": 1.15005648, + "balance_loss_mlp": 1.03505898, + "epoch": 0.29111678941830754, + "flos": 30012928335360.0, + "grad_norm": 1.4611901173228257, + "language_loss": 0.71389234, + "learning_rate": 3.326391068322232e-06, + "loss": 0.74174893, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 2.835394859313965 + }, + { + "auxiliary_loss_clip": 0.01502611, + "auxiliary_loss_mlp": 0.01289268, + "balance_loss_clip": 1.16078019, + "balance_loss_mlp": 1.03196716, + "epoch": 0.2911769126709755, + "flos": 22859643880800.0, + "grad_norm": 1.5282777741394726, + "language_loss": 0.73425758, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.76217633, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.8542680740356445 + }, + { + "auxiliary_loss_clip": 0.01493329, + "auxiliary_loss_mlp": 0.012835, + "balance_loss_clip": 1.14935827, + "balance_loss_mlp": 1.03096652, + "epoch": 0.2912370359236435, + "flos": 21652530238080.0, + "grad_norm": 2.0567979744634024, + "language_loss": 0.58106709, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60883534, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 2.8560914993286133 + }, + { + "auxiliary_loss_clip": 0.01506943, + "auxiliary_loss_mlp": 0.01296972, + "balance_loss_clip": 1.1649332, + "balance_loss_mlp": 1.03795362, + "epoch": 0.29129715917631144, + "flos": 22895562212640.0, + "grad_norm": 1.9955631548012238, + "language_loss": 0.8694607, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.8974998, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.8238325119018555 + }, + { + "auxiliary_loss_clip": 0.01498572, + "auxiliary_loss_mlp": 0.01294527, + "balance_loss_clip": 1.15578032, + "balance_loss_mlp": 1.03837013, + "epoch": 0.2913572824289794, + "flos": 22676676615360.0, + "grad_norm": 1.9512179421608344, + "language_loss": 0.67117822, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69910926, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.802676200866699 + }, + { + "auxiliary_loss_clip": 0.01509807, + "auxiliary_loss_mlp": 0.01289075, + "balance_loss_clip": 1.16858506, + "balance_loss_mlp": 1.03577876, + "epoch": 0.29141740568164737, + "flos": 23109365436480.0, + "grad_norm": 1.9496351023381635, + "language_loss": 0.70525301, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.7332418, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.7962944507598877 + }, + { + "auxiliary_loss_clip": 0.01498873, + "auxiliary_loss_mlp": 0.01298426, + "balance_loss_clip": 1.15807247, + "balance_loss_mlp": 1.03807318, + "epoch": 0.29147752893431533, + "flos": 23589313109280.0, + "grad_norm": 1.681268582229902, + "language_loss": 0.73536777, + "learning_rate": 3.324641216731237e-06, + "loss": 0.76334083, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.8046152591705322 + }, + { + "auxiliary_loss_clip": 0.01494798, + "auxiliary_loss_mlp": 0.01283341, + "balance_loss_clip": 1.15324926, + "balance_loss_mlp": 1.02737498, + "epoch": 0.2915376521869833, + "flos": 20593906799040.0, + "grad_norm": 2.2430853009542395, + "language_loss": 0.77036017, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.7981416, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 2.7473392486572266 + }, + { + "auxiliary_loss_clip": 0.01503247, + "auxiliary_loss_mlp": 0.01288571, + "balance_loss_clip": 1.16243529, + "balance_loss_mlp": 1.02878976, + "epoch": 0.29159777543965126, + "flos": 20813475103200.0, + "grad_norm": 2.305289951685501, + "language_loss": 0.78645539, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.81437361, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 2.806929588317871 + }, + { + "auxiliary_loss_clip": 0.01504824, + "auxiliary_loss_mlp": 0.01306855, + "balance_loss_clip": 1.16388106, + "balance_loss_mlp": 1.05489469, + "epoch": 0.29165789869231923, + "flos": 24246500895360.0, + "grad_norm": 1.76931052829778, + "language_loss": 0.75661397, + "learning_rate": 3.323765612674296e-06, + "loss": 0.78473073, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 4.421865701675415 + }, + { + "auxiliary_loss_clip": 0.0150745, + "auxiliary_loss_mlp": 0.01283035, + "balance_loss_clip": 1.16874492, + "balance_loss_mlp": 1.0276413, + "epoch": 0.29171802194498725, + "flos": 28952484344640.0, + "grad_norm": 1.6015040603739643, + "language_loss": 0.77366942, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.80157429, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 2.7911882400512695 + }, + { + "auxiliary_loss_clip": 0.01498633, + "auxiliary_loss_mlp": 0.01290357, + "balance_loss_clip": 1.15640604, + "balance_loss_mlp": 1.03992176, + "epoch": 0.2917781451976552, + "flos": 22600250644320.0, + "grad_norm": 1.6111811098421938, + "language_loss": 0.78134209, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.809232, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 2.8059356212615967 + }, + { + "auxiliary_loss_clip": 0.01499064, + "auxiliary_loss_mlp": 0.01298123, + "balance_loss_clip": 1.1576817, + "balance_loss_mlp": 1.04444575, + "epoch": 0.2918382684503232, + "flos": 21576293907840.0, + "grad_norm": 7.346088765165505, + "language_loss": 0.87983048, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90780234, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 2.720794439315796 + }, + { + "auxiliary_loss_clip": 0.01505467, + "auxiliary_loss_mlp": 0.01295718, + "balance_loss_clip": 1.16356373, + "balance_loss_mlp": 1.04394794, + "epoch": 0.29189839170299114, + "flos": 24355848873600.0, + "grad_norm": 1.7824920098196244, + "language_loss": 0.8665632, + "learning_rate": 3.322597437887519e-06, + "loss": 0.89457512, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 2.765615463256836 + }, + { + "auxiliary_loss_clip": 0.01627608, + "auxiliary_loss_mlp": 0.01279747, + "balance_loss_clip": 1.2896049, + "balance_loss_mlp": 1.05773163, + "epoch": 0.2919585149556591, + "flos": 71324085402720.0, + "grad_norm": 0.8047006245441626, + "language_loss": 0.6014986, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6305722, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.4423606395721436 + }, + { + "auxiliary_loss_clip": 0.01498493, + "auxiliary_loss_mlp": 0.01292594, + "balance_loss_clip": 1.1555593, + "balance_loss_mlp": 1.0391072, + "epoch": 0.2920186382083271, + "flos": 15635053756800.0, + "grad_norm": 1.876050906287338, + "language_loss": 0.68397409, + "learning_rate": 3.322013049531664e-06, + "loss": 0.71188498, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.7723703384399414 + }, + { + "auxiliary_loss_clip": 0.01504263, + "auxiliary_loss_mlp": 0.01284888, + "balance_loss_clip": 1.16097403, + "balance_loss_mlp": 1.02835011, + "epoch": 0.29207876146099504, + "flos": 28368536564160.0, + "grad_norm": 2.1190321038332836, + "language_loss": 0.83952343, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86741501, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 4.1928465366363525 + }, + { + "auxiliary_loss_clip": 0.01517175, + "auxiliary_loss_mlp": 0.01319905, + "balance_loss_clip": 1.17352223, + "balance_loss_mlp": 1.06451106, + "epoch": 0.292138884713663, + "flos": 21873084674400.0, + "grad_norm": 1.8737802850067036, + "language_loss": 0.77747214, + "learning_rate": 3.321428460652342e-06, + "loss": 0.805843, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 4.2305591106414795 + }, + { + "auxiliary_loss_clip": 0.01502884, + "auxiliary_loss_mlp": 0.01286065, + "balance_loss_clip": 1.16048813, + "balance_loss_mlp": 1.03048027, + "epoch": 0.29219900796633097, + "flos": 20994470104320.0, + "grad_norm": 3.233563233613094, + "language_loss": 0.68839455, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71628404, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 4.163851022720337 + }, + { + "auxiliary_loss_clip": 0.01510909, + "auxiliary_loss_mlp": 0.01300016, + "balance_loss_clip": 1.17094278, + "balance_loss_mlp": 1.04900932, + "epoch": 0.29225913121899894, + "flos": 35007396284160.0, + "grad_norm": 2.145704244969639, + "language_loss": 0.75891, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78701931, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 2.876497745513916 + }, + { + "auxiliary_loss_clip": 0.01518521, + "auxiliary_loss_mlp": 0.01295737, + "balance_loss_clip": 1.1773082, + "balance_loss_mlp": 1.04263163, + "epoch": 0.2923192544716669, + "flos": 13517313812640.0, + "grad_norm": 1.859804376161969, + "language_loss": 0.91525769, + "learning_rate": 3.320551201545832e-06, + "loss": 0.94340026, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.86040997505188 + }, + { + "auxiliary_loss_clip": 0.01501888, + "auxiliary_loss_mlp": 0.0128503, + "balance_loss_clip": 1.16043675, + "balance_loss_mlp": 1.03287888, + "epoch": 0.29237937772433487, + "flos": 19465532745120.0, + "grad_norm": 3.526320412435318, + "language_loss": 0.7358464, + "learning_rate": 3.320258681678008e-06, + "loss": 0.76371551, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 2.759552001953125 + }, + { + "auxiliary_loss_clip": 0.01504807, + "auxiliary_loss_mlp": 0.01277425, + "balance_loss_clip": 1.16311359, + "balance_loss_mlp": 1.02470136, + "epoch": 0.29243950097700283, + "flos": 20852731113120.0, + "grad_norm": 1.9435944096330449, + "language_loss": 0.78071833, + "learning_rate": 3.319966111745842e-06, + "loss": 0.8085407, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 2.7386515140533447 + }, + { + "auxiliary_loss_clip": 0.01515027, + "auxiliary_loss_mlp": 0.01290287, + "balance_loss_clip": 1.17424083, + "balance_loss_mlp": 1.03203249, + "epoch": 0.29249962422967085, + "flos": 23586354712800.0, + "grad_norm": 1.7499044700962394, + "language_loss": 0.81679624, + "learning_rate": 3.319673491760429e-06, + "loss": 0.84484935, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.8736276626586914 + }, + { + "auxiliary_loss_clip": 0.01511433, + "auxiliary_loss_mlp": 0.0129747, + "balance_loss_clip": 1.16977, + "balance_loss_mlp": 1.03749847, + "epoch": 0.2925597474823388, + "flos": 22275544387680.0, + "grad_norm": 1.9671824697616038, + "language_loss": 0.85511053, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.88319957, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.8488941192626953 + }, + { + "auxiliary_loss_clip": 0.01506298, + "auxiliary_loss_mlp": 0.01282923, + "balance_loss_clip": 1.16641665, + "balance_loss_mlp": 1.03096235, + "epoch": 0.2926198707350068, + "flos": 34458797913120.0, + "grad_norm": 1.7359121643302682, + "language_loss": 0.75700915, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.78490138, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 2.9783170223236084 + }, + { + "auxiliary_loss_clip": 0.01503682, + "auxiliary_loss_mlp": 0.01287988, + "balance_loss_clip": 1.16301608, + "balance_loss_mlp": 1.03488314, + "epoch": 0.29267999398767475, + "flos": 20706251101920.0, + "grad_norm": 2.2086007827244782, + "language_loss": 0.73478687, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.7627036, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 2.8548405170440674 + }, + { + "auxiliary_loss_clip": 0.01506425, + "auxiliary_loss_mlp": 0.01285828, + "balance_loss_clip": 1.16630805, + "balance_loss_mlp": 1.03577471, + "epoch": 0.2927401172403427, + "flos": 18370611692640.0, + "grad_norm": 1.6353624796431465, + "language_loss": 0.7486583, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.77658081, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.819088935852051 + }, + { + "auxiliary_loss_clip": 0.0149959, + "auxiliary_loss_mlp": 0.01290867, + "balance_loss_clip": 1.15661812, + "balance_loss_mlp": 1.03757143, + "epoch": 0.2928002404930107, + "flos": 26106440585760.0, + "grad_norm": 1.5832669587440404, + "language_loss": 0.76616794, + "learning_rate": 3.318209641423088e-06, + "loss": 0.79407251, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 2.8803818225860596 + }, + { + "auxiliary_loss_clip": 0.0150476, + "auxiliary_loss_mlp": 0.01288346, + "balance_loss_clip": 1.16203284, + "balance_loss_mlp": 1.03142595, + "epoch": 0.29286036374567864, + "flos": 21326875777440.0, + "grad_norm": 2.26514797694826, + "language_loss": 0.67792296, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.705854, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 2.9138331413269043 + }, + { + "auxiliary_loss_clip": 0.01494191, + "auxiliary_loss_mlp": 0.01281734, + "balance_loss_clip": 1.15230656, + "balance_loss_mlp": 1.02748454, + "epoch": 0.2929204869983466, + "flos": 29572047031680.0, + "grad_norm": 1.9050462186147417, + "language_loss": 0.77228224, + "learning_rate": 3.317623751303933e-06, + "loss": 0.80004144, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.891961097717285 + }, + { + "auxiliary_loss_clip": 0.01503166, + "auxiliary_loss_mlp": 0.0129294, + "balance_loss_clip": 1.16201723, + "balance_loss_mlp": 1.03773642, + "epoch": 0.2929806102510146, + "flos": 19060038779040.0, + "grad_norm": 2.0802820967302673, + "language_loss": 0.7261911, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75415218, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.8354382514953613 + }, + { + "auxiliary_loss_clip": 0.01504128, + "auxiliary_loss_mlp": 0.01297171, + "balance_loss_clip": 1.16374302, + "balance_loss_mlp": 1.0415864, + "epoch": 0.29304073350368254, + "flos": 21946362608160.0, + "grad_norm": 1.974555138450218, + "language_loss": 0.78355646, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.81156945, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.778714895248413 + }, + { + "auxiliary_loss_clip": 0.01505776, + "auxiliary_loss_mlp": 0.01288815, + "balance_loss_clip": 1.16397202, + "balance_loss_mlp": 1.02998805, + "epoch": 0.2931008567563505, + "flos": 15452655413760.0, + "grad_norm": 2.1383949075025646, + "language_loss": 0.77524114, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.80318701, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.751923084259033 + }, + { + "auxiliary_loss_clip": 0.01510889, + "auxiliary_loss_mlp": 0.01287746, + "balance_loss_clip": 1.17244446, + "balance_loss_mlp": 1.03444982, + "epoch": 0.29316098000901847, + "flos": 16985082163680.0, + "grad_norm": 1.6938103211744706, + "language_loss": 0.69263333, + "learning_rate": 3.316451371581431e-06, + "loss": 0.72061968, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 2.7759156227111816 + }, + { + "auxiliary_loss_clip": 0.0150873, + "auxiliary_loss_mlp": 0.01297097, + "balance_loss_clip": 1.16750741, + "balance_loss_mlp": 1.04513669, + "epoch": 0.29322110326168643, + "flos": 16359299258400.0, + "grad_norm": 2.1029013708090085, + "language_loss": 0.82243866, + "learning_rate": 3.316158151823096e-06, + "loss": 0.85049695, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 2.831481456756592 + }, + { + "auxiliary_loss_clip": 0.01510722, + "auxiliary_loss_mlp": 0.0129133, + "balance_loss_clip": 1.17059696, + "balance_loss_mlp": 1.0366993, + "epoch": 0.29328122651435445, + "flos": 13992596321760.0, + "grad_norm": 2.0644225515331858, + "language_loss": 0.67815542, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70617598, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.7839760780334473 + }, + { + "auxiliary_loss_clip": 0.01512786, + "auxiliary_loss_mlp": 0.01297061, + "balance_loss_clip": 1.17243338, + "balance_loss_mlp": 1.04643559, + "epoch": 0.2933413497670224, + "flos": 25266930312960.0, + "grad_norm": 1.9015496378980536, + "language_loss": 0.73446065, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.76255906, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.840183734893799 + }, + { + "auxiliary_loss_clip": 0.01519441, + "auxiliary_loss_mlp": 0.01300204, + "balance_loss_clip": 1.18032336, + "balance_loss_mlp": 1.04728937, + "epoch": 0.2934014730196904, + "flos": 32126458253760.0, + "grad_norm": 2.7882497190435926, + "language_loss": 0.66330194, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.6914984, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 2.839559555053711 + }, + { + "auxiliary_loss_clip": 0.01509389, + "auxiliary_loss_mlp": 0.01289171, + "balance_loss_clip": 1.16934597, + "balance_loss_mlp": 1.0381639, + "epoch": 0.29346159627235835, + "flos": 24354786885120.0, + "grad_norm": 3.0344342987102193, + "language_loss": 0.70839709, + "learning_rate": 3.314984773812481e-06, + "loss": 0.73638272, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 2.8502676486968994 + }, + { + "auxiliary_loss_clip": 0.0150996, + "auxiliary_loss_mlp": 0.01310245, + "balance_loss_clip": 1.16918123, + "balance_loss_mlp": 1.05961955, + "epoch": 0.2935217195250263, + "flos": 22748930488800.0, + "grad_norm": 1.5586937767676452, + "language_loss": 0.83597279, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86417484, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 2.7729570865631104 + }, + { + "auxiliary_loss_clip": 0.0151668, + "auxiliary_loss_mlp": 0.01302321, + "balance_loss_clip": 1.17738056, + "balance_loss_mlp": 1.04978824, + "epoch": 0.2935818427776943, + "flos": 21727704579840.0, + "grad_norm": 2.224129356531185, + "language_loss": 0.71251535, + "learning_rate": 3.314397785576548e-06, + "loss": 0.74070537, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.8408260345458984 + }, + { + "auxiliary_loss_clip": 0.01514602, + "auxiliary_loss_mlp": 0.0129505, + "balance_loss_clip": 1.17533672, + "balance_loss_mlp": 1.04537845, + "epoch": 0.29364196603036224, + "flos": 23807212574400.0, + "grad_norm": 3.7334547947189725, + "language_loss": 0.92800182, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.95609838, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.760894536972046 + }, + { + "auxiliary_loss_clip": 0.01525063, + "auxiliary_loss_mlp": 0.01300952, + "balance_loss_clip": 1.18731391, + "balance_loss_mlp": 1.05051732, + "epoch": 0.2937020892830302, + "flos": 23471924361120.0, + "grad_norm": 2.756224099313644, + "language_loss": 0.73898876, + "learning_rate": 3.313810597972234e-06, + "loss": 0.76724887, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.867037534713745 + }, + { + "auxiliary_loss_clip": 0.01515404, + "auxiliary_loss_mlp": 0.01298952, + "balance_loss_clip": 1.17639089, + "balance_loss_mlp": 1.04660952, + "epoch": 0.2937622125356982, + "flos": 24274643954400.0, + "grad_norm": 2.0571981906577137, + "language_loss": 0.84994739, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87809098, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 2.809556484222412 + }, + { + "auxiliary_loss_clip": 0.0150501, + "auxiliary_loss_mlp": 0.01293258, + "balance_loss_clip": 1.16575885, + "balance_loss_mlp": 1.03900862, + "epoch": 0.29382233578836614, + "flos": 20664264264480.0, + "grad_norm": 3.8362680851927564, + "language_loss": 0.77039337, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79837602, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 2.7521631717681885 + }, + { + "auxiliary_loss_clip": 0.01507752, + "auxiliary_loss_mlp": 0.0129535, + "balance_loss_clip": 1.16852331, + "balance_loss_mlp": 1.04605985, + "epoch": 0.2938824590410341, + "flos": 16546931687520.0, + "grad_norm": 3.226803929130319, + "language_loss": 0.79485583, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.82288682, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.851496934890747 + }, + { + "auxiliary_loss_clip": 0.01510005, + "auxiliary_loss_mlp": 0.01295557, + "balance_loss_clip": 1.17167449, + "balance_loss_mlp": 1.04435921, + "epoch": 0.29394258229370207, + "flos": 37928538528480.0, + "grad_norm": 1.4703642701225845, + "language_loss": 0.55133951, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57939517, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 4.556346893310547 + }, + { + "auxiliary_loss_clip": 0.01514643, + "auxiliary_loss_mlp": 0.01285085, + "balance_loss_clip": 1.17584729, + "balance_loss_mlp": 1.02835548, + "epoch": 0.29400270554637004, + "flos": 20046597985440.0, + "grad_norm": 1.6621191417487302, + "language_loss": 0.85234809, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.88034534, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.759554386138916 + }, + { + "auxiliary_loss_clip": 0.01513048, + "auxiliary_loss_mlp": 0.01297647, + "balance_loss_clip": 1.17364538, + "balance_loss_mlp": 1.0420624, + "epoch": 0.294062828799038, + "flos": 15268095165600.0, + "grad_norm": 2.215299001850375, + "language_loss": 0.72832888, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.75643587, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 2.714630365371704 + }, + { + "auxiliary_loss_clip": 0.01511907, + "auxiliary_loss_mlp": 0.01286924, + "balance_loss_clip": 1.17350197, + "balance_loss_mlp": 1.03267479, + "epoch": 0.294122952051706, + "flos": 22749347698560.0, + "grad_norm": 2.8861185045500743, + "language_loss": 0.77668262, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.80467087, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 2.764916181564331 + }, + { + "auxiliary_loss_clip": 0.01500904, + "auxiliary_loss_mlp": 0.01281981, + "balance_loss_clip": 1.16206884, + "balance_loss_mlp": 1.02906609, + "epoch": 0.294183075304374, + "flos": 24975221919840.0, + "grad_norm": 2.3588783133418154, + "language_loss": 0.78555924, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.81338811, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.7909138202667236 + }, + { + "auxiliary_loss_clip": 0.01507732, + "auxiliary_loss_mlp": 0.01288998, + "balance_loss_clip": 1.16913116, + "balance_loss_mlp": 1.035321, + "epoch": 0.29424319855704195, + "flos": 30955452583680.0, + "grad_norm": 1.8714995477929246, + "language_loss": 0.85118306, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87915039, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.8255317211151123 + }, + { + "auxiliary_loss_clip": 0.01507111, + "auxiliary_loss_mlp": 0.01286446, + "balance_loss_clip": 1.16874707, + "balance_loss_mlp": 1.03124285, + "epoch": 0.2943033218097099, + "flos": 15233390534880.0, + "grad_norm": 2.948960585262257, + "language_loss": 0.90353334, + "learning_rate": 3.310871672543274e-06, + "loss": 0.9314689, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.7438793182373047 + }, + { + "auxiliary_loss_clip": 0.01501998, + "auxiliary_loss_mlp": 0.01281918, + "balance_loss_clip": 1.16219866, + "balance_loss_mlp": 1.02652431, + "epoch": 0.2943634450623779, + "flos": 21728045933280.0, + "grad_norm": 1.868110972599685, + "language_loss": 0.86473393, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.89257312, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.764695405960083 + }, + { + "auxiliary_loss_clip": 0.01507947, + "auxiliary_loss_mlp": 0.01287165, + "balance_loss_clip": 1.16922784, + "balance_loss_mlp": 1.03081703, + "epoch": 0.29442356831504585, + "flos": 22604346885600.0, + "grad_norm": 1.7836303249756778, + "language_loss": 0.73981953, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.76777065, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 5.788020372390747 + }, + { + "auxiliary_loss_clip": 0.01502299, + "auxiliary_loss_mlp": 0.01293602, + "balance_loss_clip": 1.16178608, + "balance_loss_mlp": 1.03725398, + "epoch": 0.2944836915677138, + "flos": 20013486337440.0, + "grad_norm": 2.2625059017791997, + "language_loss": 0.74640691, + "learning_rate": 3.309989025093813e-06, + "loss": 0.77436602, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 4.413605690002441 + }, + { + "auxiliary_loss_clip": 0.01504084, + "auxiliary_loss_mlp": 0.01296067, + "balance_loss_clip": 1.16309738, + "balance_loss_mlp": 1.0385747, + "epoch": 0.2945438148203818, + "flos": 20047849614720.0, + "grad_norm": 2.706943946329075, + "language_loss": 0.70879853, + "learning_rate": 3.309694709912618e-06, + "loss": 0.73680001, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.7486705780029297 + }, + { + "auxiliary_loss_clip": 0.01506234, + "auxiliary_loss_mlp": 0.01302071, + "balance_loss_clip": 1.16625118, + "balance_loss_mlp": 1.04705858, + "epoch": 0.29460393807304974, + "flos": 23735868976800.0, + "grad_norm": 1.8942479628237994, + "language_loss": 0.79068369, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.81876671, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 2.8041276931762695 + }, + { + "auxiliary_loss_clip": 0.01502506, + "auxiliary_loss_mlp": 0.01290898, + "balance_loss_clip": 1.16237938, + "balance_loss_mlp": 1.03683901, + "epoch": 0.2946640613257177, + "flos": 14977865970720.0, + "grad_norm": 1.736139728381251, + "language_loss": 0.80607438, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.83400846, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.736769437789917 + }, + { + "auxiliary_loss_clip": 0.01504206, + "auxiliary_loss_mlp": 0.01294186, + "balance_loss_clip": 1.16340983, + "balance_loss_mlp": 1.04069901, + "epoch": 0.2947241845783857, + "flos": 24246235398240.0, + "grad_norm": 6.859119489527001, + "language_loss": 0.57700235, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60498631, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 2.806412935256958 + }, + { + "auxiliary_loss_clip": 0.01503129, + "auxiliary_loss_mlp": 0.01289328, + "balance_loss_clip": 1.16211998, + "balance_loss_mlp": 1.03298068, + "epoch": 0.29478430783105364, + "flos": 19940473900800.0, + "grad_norm": 2.895282242080729, + "language_loss": 0.75950265, + "learning_rate": 3.308516952661925e-06, + "loss": 0.78742719, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 2.789090394973755 + }, + { + "auxiliary_loss_clip": 0.01505602, + "auxiliary_loss_mlp": 0.01291019, + "balance_loss_clip": 1.1631695, + "balance_loss_mlp": 1.03142858, + "epoch": 0.2948444310837216, + "flos": 27383835837600.0, + "grad_norm": 1.9955963728100425, + "language_loss": 0.62589085, + "learning_rate": 3.3082223892736e-06, + "loss": 0.65385699, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.8680224418640137 + }, + { + "auxiliary_loss_clip": 0.01504658, + "auxiliary_loss_mlp": 0.0129458, + "balance_loss_clip": 1.16282272, + "balance_loss_mlp": 1.04033065, + "epoch": 0.2949045543363896, + "flos": 23407787113920.0, + "grad_norm": 1.9351667707591487, + "language_loss": 0.73698187, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.76497424, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 2.8115241527557373 + }, + { + "auxiliary_loss_clip": 0.01498517, + "auxiliary_loss_mlp": 0.01293411, + "balance_loss_clip": 1.15720773, + "balance_loss_mlp": 1.03935206, + "epoch": 0.2949646775890576, + "flos": 23953768441920.0, + "grad_norm": 2.5911578785551463, + "language_loss": 0.81698364, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.84490299, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 2.9277031421661377 + }, + { + "auxiliary_loss_clip": 0.01503427, + "auxiliary_loss_mlp": 0.01291202, + "balance_loss_clip": 1.16278911, + "balance_loss_mlp": 1.04000425, + "epoch": 0.29502480084172555, + "flos": 22786555587840.0, + "grad_norm": 2.4510433147904553, + "language_loss": 0.87521809, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.90316439, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 2.8900790214538574 + }, + { + "auxiliary_loss_clip": 0.01503629, + "auxiliary_loss_mlp": 0.01297541, + "balance_loss_clip": 1.16292715, + "balance_loss_mlp": 1.04195666, + "epoch": 0.2950849240943935, + "flos": 19648727579520.0, + "grad_norm": 2.087193400664363, + "language_loss": 0.82120734, + "learning_rate": 3.307043639752782e-06, + "loss": 0.84921902, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.805443048477173 + }, + { + "auxiliary_loss_clip": 0.01573951, + "auxiliary_loss_mlp": 0.01228485, + "balance_loss_clip": 1.24009752, + "balance_loss_mlp": 0.99807739, + "epoch": 0.2951450473470615, + "flos": 71008937406720.0, + "grad_norm": 0.7736170765416666, + "language_loss": 0.5726037, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.60062802, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 3.2704596519470215 + }, + { + "auxiliary_loss_clip": 0.0149466, + "auxiliary_loss_mlp": 0.01299308, + "balance_loss_clip": 1.15253615, + "balance_loss_mlp": 1.0437237, + "epoch": 0.29520517059972945, + "flos": 22968536721120.0, + "grad_norm": 1.761538377390936, + "language_loss": 0.87220263, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.90014231, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 2.825320243835449 + }, + { + "auxiliary_loss_clip": 0.01502213, + "auxiliary_loss_mlp": 0.01299019, + "balance_loss_clip": 1.16147292, + "balance_loss_mlp": 1.04839373, + "epoch": 0.2952652938523974, + "flos": 20487782714400.0, + "grad_norm": 4.081229213700179, + "language_loss": 0.72901756, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75702989, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 2.8187785148620605 + }, + { + "auxiliary_loss_clip": 0.01504408, + "auxiliary_loss_mlp": 0.01315889, + "balance_loss_clip": 1.16318965, + "balance_loss_mlp": 1.06545448, + "epoch": 0.2953254171050654, + "flos": 19648651723200.0, + "grad_norm": 1.701503398548955, + "language_loss": 0.89959061, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92779356, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.8423044681549072 + }, + { + "auxiliary_loss_clip": 0.01503036, + "auxiliary_loss_mlp": 0.01322301, + "balance_loss_clip": 1.16132784, + "balance_loss_mlp": 1.06728899, + "epoch": 0.29538554035773334, + "flos": 22750220046240.0, + "grad_norm": 1.5802926974770348, + "language_loss": 0.83612102, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.8643744, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.803959608078003 + }, + { + "auxiliary_loss_clip": 0.01493457, + "auxiliary_loss_mlp": 0.01309766, + "balance_loss_clip": 1.15104759, + "balance_loss_mlp": 1.05971265, + "epoch": 0.2954456636104013, + "flos": 21874032878400.0, + "grad_norm": 1.7678287167107223, + "language_loss": 0.77229875, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.800331, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.7813546657562256 + }, + { + "auxiliary_loss_clip": 0.01497611, + "auxiliary_loss_mlp": 0.01293206, + "balance_loss_clip": 1.15634966, + "balance_loss_mlp": 1.04601336, + "epoch": 0.2955057868630693, + "flos": 40445741861280.0, + "grad_norm": 2.3605491758061476, + "language_loss": 0.81735373, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.84526193, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 2.98392391204834 + }, + { + "auxiliary_loss_clip": 0.01498452, + "auxiliary_loss_mlp": 0.01305523, + "balance_loss_clip": 1.15639222, + "balance_loss_mlp": 1.05356181, + "epoch": 0.29556591011573724, + "flos": 22566683858400.0, + "grad_norm": 1.8670374079443037, + "language_loss": 0.84952056, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.87756032, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 2.8222405910491943 + }, + { + "auxiliary_loss_clip": 0.0149955, + "auxiliary_loss_mlp": 0.01313297, + "balance_loss_clip": 1.15923977, + "balance_loss_mlp": 1.06476974, + "epoch": 0.2956260333684052, + "flos": 22091401349280.0, + "grad_norm": 2.0893268909903018, + "language_loss": 0.701195, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.72932345, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 2.737677574157715 + }, + { + "auxiliary_loss_clip": 0.01498343, + "auxiliary_loss_mlp": 0.01319484, + "balance_loss_clip": 1.15797961, + "balance_loss_mlp": 1.06923985, + "epoch": 0.2956861566210732, + "flos": 16437356140320.0, + "grad_norm": 2.2944222115344934, + "language_loss": 0.91114074, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93931895, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.805572271347046 + }, + { + "auxiliary_loss_clip": 0.01503651, + "auxiliary_loss_mlp": 0.01313956, + "balance_loss_clip": 1.16149938, + "balance_loss_mlp": 1.06390226, + "epoch": 0.2957462798737412, + "flos": 25814466695520.0, + "grad_norm": 1.868136950024449, + "language_loss": 0.72750533, + "learning_rate": 3.303797991757425e-06, + "loss": 0.75568134, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.8324155807495117 + }, + { + "auxiliary_loss_clip": 0.01497406, + "auxiliary_loss_mlp": 0.01307981, + "balance_loss_clip": 1.15601373, + "balance_loss_mlp": 1.06078911, + "epoch": 0.29580640312640916, + "flos": 16692728991840.0, + "grad_norm": 1.9363701394217767, + "language_loss": 0.76107836, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.78913224, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 2.7957117557525635 + }, + { + "auxiliary_loss_clip": 0.0150739, + "auxiliary_loss_mlp": 0.01304465, + "balance_loss_clip": 1.1667136, + "balance_loss_mlp": 1.05250406, + "epoch": 0.2958665263790771, + "flos": 23947548223680.0, + "grad_norm": 2.6910693739541895, + "language_loss": 0.69091314, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.71903169, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 2.857943534851074 + }, + { + "auxiliary_loss_clip": 0.01503716, + "auxiliary_loss_mlp": 0.01308434, + "balance_loss_clip": 1.16161275, + "balance_loss_mlp": 1.05380321, + "epoch": 0.2959266496317451, + "flos": 18480376880640.0, + "grad_norm": 1.905771737020744, + "language_loss": 0.74730921, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.77543074, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.728172779083252 + }, + { + "auxiliary_loss_clip": 0.01500996, + "auxiliary_loss_mlp": 0.01311906, + "balance_loss_clip": 1.15842068, + "balance_loss_mlp": 1.05574954, + "epoch": 0.29598677288441305, + "flos": 25959846790080.0, + "grad_norm": 1.831325738069031, + "language_loss": 0.76895607, + "learning_rate": 3.302616272134737e-06, + "loss": 0.79708517, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.814600944519043 + }, + { + "auxiliary_loss_clip": 0.01496331, + "auxiliary_loss_mlp": 0.01301805, + "balance_loss_clip": 1.1536206, + "balance_loss_mlp": 1.04755521, + "epoch": 0.296046896137081, + "flos": 25158378826080.0, + "grad_norm": 1.6204762554483192, + "language_loss": 0.86567336, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.8936547, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 2.7932839393615723 + }, + { + "auxiliary_loss_clip": 0.0149616, + "auxiliary_loss_mlp": 0.01288323, + "balance_loss_clip": 1.15338731, + "balance_loss_mlp": 1.03807902, + "epoch": 0.296107019389749, + "flos": 21763243630080.0, + "grad_norm": 1.4436722117285155, + "language_loss": 0.82026124, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84810603, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 2.834052562713623 + }, + { + "auxiliary_loss_clip": 0.0150586, + "auxiliary_loss_mlp": 0.01288128, + "balance_loss_clip": 1.16293335, + "balance_loss_mlp": 1.03673911, + "epoch": 0.29616714264241695, + "flos": 17961552479520.0, + "grad_norm": 3.9520867755353146, + "language_loss": 0.86620396, + "learning_rate": 3.301729463727452e-06, + "loss": 0.89414382, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.734159469604492 + }, + { + "auxiliary_loss_clip": 0.01499804, + "auxiliary_loss_mlp": 0.01304729, + "balance_loss_clip": 1.15833247, + "balance_loss_mlp": 1.05047917, + "epoch": 0.2962272658950849, + "flos": 15014580793920.0, + "grad_norm": 1.858887408314666, + "language_loss": 0.86317563, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.89122093, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 2.749713182449341 + }, + { + "auxiliary_loss_clip": 0.01500514, + "auxiliary_loss_mlp": 0.0129845, + "balance_loss_clip": 1.15820491, + "balance_loss_mlp": 1.04820561, + "epoch": 0.2962873891477529, + "flos": 14722644831840.0, + "grad_norm": 1.9257597655564507, + "language_loss": 0.80725443, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83524406, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 4.645334005355835 + }, + { + "auxiliary_loss_clip": 0.0150138, + "auxiliary_loss_mlp": 0.01305989, + "balance_loss_clip": 1.15809846, + "balance_loss_mlp": 1.05040479, + "epoch": 0.29634751240042084, + "flos": 26726192913600.0, + "grad_norm": 4.367021012070289, + "language_loss": 0.72717941, + "learning_rate": 3.300842211064773e-06, + "loss": 0.75525308, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 2.785357713699341 + }, + { + "auxiliary_loss_clip": 0.01505721, + "auxiliary_loss_mlp": 0.01294189, + "balance_loss_clip": 1.16246557, + "balance_loss_mlp": 1.04070246, + "epoch": 0.2964076356530888, + "flos": 14572865070720.0, + "grad_norm": 2.7332496081450164, + "language_loss": 0.72290194, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.75090098, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 2.8518564701080322 + }, + { + "auxiliary_loss_clip": 0.01564887, + "auxiliary_loss_mlp": 0.01236015, + "balance_loss_clip": 1.22733581, + "balance_loss_mlp": 1.013237, + "epoch": 0.29646775890575683, + "flos": 63111039664320.0, + "grad_norm": 0.8134567884166268, + "language_loss": 0.6065402, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.63454926, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 3.234058141708374 + }, + { + "auxiliary_loss_clip": 0.01564297, + "auxiliary_loss_mlp": 0.01237335, + "balance_loss_clip": 1.22608149, + "balance_loss_mlp": 1.00997925, + "epoch": 0.2965278821584248, + "flos": 63074400697440.0, + "grad_norm": 0.7378452449265479, + "language_loss": 0.52354467, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.551561, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.1546947956085205 + }, + { + "auxiliary_loss_clip": 0.01499203, + "auxiliary_loss_mlp": 0.01288137, + "balance_loss_clip": 1.15590024, + "balance_loss_mlp": 1.03312469, + "epoch": 0.29658800541109276, + "flos": 23770990817280.0, + "grad_norm": 1.5601276858342676, + "language_loss": 0.81539559, + "learning_rate": 3.299658516973972e-06, + "loss": 0.84326905, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.807990550994873 + }, + { + "auxiliary_loss_clip": 0.01506748, + "auxiliary_loss_mlp": 0.0128538, + "balance_loss_clip": 1.16385651, + "balance_loss_mlp": 1.0322746, + "epoch": 0.2966481286637607, + "flos": 23990862546720.0, + "grad_norm": 1.8221294760411018, + "language_loss": 0.75267047, + "learning_rate": 3.299362470215261e-06, + "loss": 0.78059179, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.8169898986816406 + }, + { + "auxiliary_loss_clip": 0.01496857, + "auxiliary_loss_mlp": 0.0128465, + "balance_loss_clip": 1.15321493, + "balance_loss_mlp": 1.02639508, + "epoch": 0.2967082519164287, + "flos": 17167063296960.0, + "grad_norm": 1.844681927522479, + "language_loss": 0.62611955, + "learning_rate": 3.299066374184594e-06, + "loss": 0.65393466, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 4.2820940017700195 + }, + { + "auxiliary_loss_clip": 0.01504702, + "auxiliary_loss_mlp": 0.01288415, + "balance_loss_clip": 1.16111827, + "balance_loss_mlp": 1.03550029, + "epoch": 0.29676837516909665, + "flos": 29390293467360.0, + "grad_norm": 1.4943547313810674, + "language_loss": 0.79808486, + "learning_rate": 3.2987702288932e-06, + "loss": 0.82601601, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 5.8749494552612305 + }, + { + "auxiliary_loss_clip": 0.01509397, + "auxiliary_loss_mlp": 0.0130769, + "balance_loss_clip": 1.16563153, + "balance_loss_mlp": 1.05515707, + "epoch": 0.2968284984217646, + "flos": 34754109481440.0, + "grad_norm": 1.7719114460283363, + "language_loss": 0.74582624, + "learning_rate": 3.298474034352309e-06, + "loss": 0.77399719, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.9563963413238525 + }, + { + "auxiliary_loss_clip": 0.01506715, + "auxiliary_loss_mlp": 0.0129256, + "balance_loss_clip": 1.16336274, + "balance_loss_mlp": 1.03869212, + "epoch": 0.2968886216744326, + "flos": 21546292368960.0, + "grad_norm": 1.555314521384053, + "language_loss": 0.78481293, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.81280565, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.851972818374634 + }, + { + "auxiliary_loss_clip": 0.01502889, + "auxiliary_loss_mlp": 0.0129873, + "balance_loss_clip": 1.15899158, + "balance_loss_mlp": 1.04333615, + "epoch": 0.29694874492710055, + "flos": 12789009997920.0, + "grad_norm": 2.165029325086237, + "language_loss": 0.77233088, + "learning_rate": 3.297881497566964e-06, + "loss": 0.80034703, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.8132450580596924 + }, + { + "auxiliary_loss_clip": 0.01498924, + "auxiliary_loss_mlp": 0.01297376, + "balance_loss_clip": 1.15583956, + "balance_loss_mlp": 1.04140973, + "epoch": 0.2970088681797685, + "flos": 24572193284160.0, + "grad_norm": 1.953897440128471, + "language_loss": 0.7817772, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80974019, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 2.8549578189849854 + }, + { + "auxiliary_loss_clip": 0.01498212, + "auxiliary_loss_mlp": 0.01288348, + "balance_loss_clip": 1.15574753, + "balance_loss_mlp": 1.03428924, + "epoch": 0.2970689914324365, + "flos": 23661415270080.0, + "grad_norm": 1.5681558700460119, + "language_loss": 0.75455636, + "learning_rate": 3.297288763918435e-06, + "loss": 0.78242195, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.804102659225464 + }, + { + "auxiliary_loss_clip": 0.0150007, + "auxiliary_loss_mlp": 0.012908, + "balance_loss_clip": 1.15556264, + "balance_loss_mlp": 1.03483438, + "epoch": 0.29712911468510445, + "flos": 39673099663200.0, + "grad_norm": 2.714664846159622, + "language_loss": 0.74103355, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.7689423, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 2.872955083847046 + }, + { + "auxiliary_loss_clip": 0.0149591, + "auxiliary_loss_mlp": 0.01300201, + "balance_loss_clip": 1.15276527, + "balance_loss_mlp": 1.04518819, + "epoch": 0.2971892379377724, + "flos": 26397693840960.0, + "grad_norm": 1.9451105648464375, + "language_loss": 0.7023921, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.73035324, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 2.808379650115967 + }, + { + "auxiliary_loss_clip": 0.01495882, + "auxiliary_loss_mlp": 0.01308941, + "balance_loss_clip": 1.15279126, + "balance_loss_mlp": 1.05869651, + "epoch": 0.2972493611904404, + "flos": 17605137916800.0, + "grad_norm": 2.4318064786744333, + "language_loss": 0.80027974, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82832801, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 2.8169479370117188 + }, + { + "auxiliary_loss_clip": 0.01492878, + "auxiliary_loss_mlp": 0.01285512, + "balance_loss_clip": 1.14889205, + "balance_loss_mlp": 1.03622127, + "epoch": 0.2973094844431084, + "flos": 20414884062240.0, + "grad_norm": 2.0954702611851075, + "language_loss": 0.8300463, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85783017, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 2.811203956604004 + }, + { + "auxiliary_loss_clip": 0.01491304, + "auxiliary_loss_mlp": 0.01284938, + "balance_loss_clip": 1.1479497, + "balance_loss_mlp": 1.03545642, + "epoch": 0.29736960769577636, + "flos": 17495410656960.0, + "grad_norm": 1.908832829720398, + "language_loss": 0.66814256, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69590497, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 2.705209255218506 + }, + { + "auxiliary_loss_clip": 0.0148965, + "auxiliary_loss_mlp": 0.01297432, + "balance_loss_clip": 1.14647722, + "balance_loss_mlp": 1.04261065, + "epoch": 0.2974297309484443, + "flos": 26106364729440.0, + "grad_norm": 2.2219831859497976, + "language_loss": 0.73465484, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.76252568, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 2.851499319076538 + }, + { + "auxiliary_loss_clip": 0.01498597, + "auxiliary_loss_mlp": 0.01278044, + "balance_loss_clip": 1.15499175, + "balance_loss_mlp": 1.02322233, + "epoch": 0.2974898542011123, + "flos": 25668821103840.0, + "grad_norm": 1.9652655712077307, + "language_loss": 0.73277515, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.76054156, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 2.804490327835083 + }, + { + "auxiliary_loss_clip": 0.01488837, + "auxiliary_loss_mlp": 0.01289695, + "balance_loss_clip": 1.14452147, + "balance_loss_mlp": 1.0404048, + "epoch": 0.29754997745378026, + "flos": 18663344146080.0, + "grad_norm": 2.2660833417739084, + "language_loss": 0.84234738, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.87013268, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.761809825897217 + }, + { + "auxiliary_loss_clip": 0.01490578, + "auxiliary_loss_mlp": 0.01290321, + "balance_loss_clip": 1.14740562, + "balance_loss_mlp": 1.04064941, + "epoch": 0.2976101007064482, + "flos": 22276871873280.0, + "grad_norm": 5.763548791537007, + "language_loss": 0.70954818, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73735714, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 2.799211025238037 + }, + { + "auxiliary_loss_clip": 0.0149099, + "auxiliary_loss_mlp": 0.01283626, + "balance_loss_clip": 1.14604557, + "balance_loss_mlp": 1.03738713, + "epoch": 0.2976702239591162, + "flos": 21947879734560.0, + "grad_norm": 3.25057798377682, + "language_loss": 0.82256961, + "learning_rate": 3.294322145875789e-06, + "loss": 0.85031575, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 2.8126959800720215 + }, + { + "auxiliary_loss_clip": 0.01479608, + "auxiliary_loss_mlp": 0.0128056, + "balance_loss_clip": 1.13583219, + "balance_loss_mlp": 1.0303154, + "epoch": 0.29773034721178415, + "flos": 24639099287040.0, + "grad_norm": 2.985507649974422, + "language_loss": 0.73973036, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76733208, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.8058362007141113 + }, + { + "auxiliary_loss_clip": 0.01491976, + "auxiliary_loss_mlp": 0.0128687, + "balance_loss_clip": 1.14793885, + "balance_loss_mlp": 1.03662562, + "epoch": 0.2977904704644521, + "flos": 20559619378080.0, + "grad_norm": 2.8681680229769806, + "language_loss": 0.83712637, + "learning_rate": 3.293728232937228e-06, + "loss": 0.86491477, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.7771201133728027 + }, + { + "auxiliary_loss_clip": 0.01496064, + "auxiliary_loss_mlp": 0.01282149, + "balance_loss_clip": 1.15465164, + "balance_loss_mlp": 1.03076065, + "epoch": 0.2978505937171201, + "flos": 18918451500480.0, + "grad_norm": 1.9063889052919611, + "language_loss": 0.73599994, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.76378208, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 2.8416006565093994 + }, + { + "auxiliary_loss_clip": 0.01491509, + "auxiliary_loss_mlp": 0.01275743, + "balance_loss_clip": 1.14952826, + "balance_loss_mlp": 1.02187502, + "epoch": 0.29791071696978805, + "flos": 19319773368960.0, + "grad_norm": 1.9514546290787487, + "language_loss": 0.75666505, + "learning_rate": 3.293134123765452e-06, + "loss": 0.78433752, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 2.75040864944458 + }, + { + "auxiliary_loss_clip": 0.01496849, + "auxiliary_loss_mlp": 0.01280768, + "balance_loss_clip": 1.15410531, + "balance_loss_mlp": 1.02594614, + "epoch": 0.297970840222456, + "flos": 18808800096960.0, + "grad_norm": 1.828214275800083, + "language_loss": 0.72468948, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.75246561, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 2.796063184738159 + }, + { + "auxiliary_loss_clip": 0.01491771, + "auxiliary_loss_mlp": 0.01295985, + "balance_loss_clip": 1.14978731, + "balance_loss_mlp": 1.04211664, + "epoch": 0.298030963475124, + "flos": 22854751148160.0, + "grad_norm": 1.736730717810136, + "language_loss": 0.78935444, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81723201, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 2.785573720932007 + }, + { + "auxiliary_loss_clip": 0.01493598, + "auxiliary_loss_mlp": 0.01299755, + "balance_loss_clip": 1.1515944, + "balance_loss_mlp": 1.04760396, + "epoch": 0.298091086727792, + "flos": 21870467631360.0, + "grad_norm": 2.7393608555059217, + "language_loss": 0.70466626, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.73259985, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 2.78387451171875 + }, + { + "auxiliary_loss_clip": 0.01498234, + "auxiliary_loss_mlp": 0.01289989, + "balance_loss_clip": 1.15717304, + "balance_loss_mlp": 1.03936386, + "epoch": 0.29815120998045996, + "flos": 21176944303680.0, + "grad_norm": 1.5761696525420148, + "language_loss": 0.79051387, + "learning_rate": 3.291945317082743e-06, + "loss": 0.81839609, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 2.8545031547546387 + }, + { + "auxiliary_loss_clip": 0.01496249, + "auxiliary_loss_mlp": 0.01287527, + "balance_loss_clip": 1.15396905, + "balance_loss_mlp": 1.03995323, + "epoch": 0.29821133323312793, + "flos": 19898031925440.0, + "grad_norm": 1.8065615416006675, + "language_loss": 0.80197781, + "learning_rate": 3.291647992907147e-06, + "loss": 0.82981563, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 2.769038200378418 + }, + { + "auxiliary_loss_clip": 0.01485896, + "auxiliary_loss_mlp": 0.01288181, + "balance_loss_clip": 1.14333665, + "balance_loss_mlp": 1.0354569, + "epoch": 0.2982714564857959, + "flos": 12752446887360.0, + "grad_norm": 2.257331359885131, + "language_loss": 0.74718684, + "learning_rate": 3.291350619752129e-06, + "loss": 0.77492762, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.6744203567504883 + }, + { + "auxiliary_loss_clip": 0.01486554, + "auxiliary_loss_mlp": 0.01298006, + "balance_loss_clip": 1.14388013, + "balance_loss_mlp": 1.04718971, + "epoch": 0.29833157973846386, + "flos": 22274027261280.0, + "grad_norm": 2.0752851670685195, + "language_loss": 0.62089825, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64874387, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 2.6776387691497803 + }, + { + "auxiliary_loss_clip": 0.01489521, + "auxiliary_loss_mlp": 0.01281483, + "balance_loss_clip": 1.14764059, + "balance_loss_mlp": 1.02971292, + "epoch": 0.2983917029911318, + "flos": 15374560603680.0, + "grad_norm": 1.747675256376634, + "language_loss": 0.83141804, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85912812, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 2.6451575756073 + }, + { + "auxiliary_loss_clip": 0.01493905, + "auxiliary_loss_mlp": 0.01297557, + "balance_loss_clip": 1.15045476, + "balance_loss_mlp": 1.04521489, + "epoch": 0.2984518262437998, + "flos": 15379453336320.0, + "grad_norm": 2.1446744587917497, + "language_loss": 0.66126263, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68917722, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.6749653816223145 + }, + { + "auxiliary_loss_clip": 0.01483939, + "auxiliary_loss_mlp": 0.01286346, + "balance_loss_clip": 1.14209783, + "balance_loss_mlp": 1.03743672, + "epoch": 0.29851194949646775, + "flos": 18110118539520.0, + "grad_norm": 1.8337457138144957, + "language_loss": 0.71232724, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.74003017, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 4.329511642456055 + }, + { + "auxiliary_loss_clip": 0.01495795, + "auxiliary_loss_mlp": 0.01281835, + "balance_loss_clip": 1.15376425, + "balance_loss_mlp": 1.02720416, + "epoch": 0.2985720727491357, + "flos": 22020437033280.0, + "grad_norm": 1.7697458390794967, + "language_loss": 0.6642493, + "learning_rate": 3.289863019680461e-06, + "loss": 0.69202554, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 2.8815107345581055 + }, + { + "auxiliary_loss_clip": 0.01495489, + "auxiliary_loss_mlp": 0.01283272, + "balance_loss_clip": 1.15206003, + "balance_loss_mlp": 1.02864075, + "epoch": 0.2986321960018037, + "flos": 13042979507520.0, + "grad_norm": 3.3880418432546087, + "language_loss": 0.73749155, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76527917, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 2.7699880599975586 + }, + { + "auxiliary_loss_clip": 0.01491385, + "auxiliary_loss_mlp": 0.01284381, + "balance_loss_clip": 1.14986062, + "balance_loss_mlp": 1.03356421, + "epoch": 0.29869231925447165, + "flos": 14466778914240.0, + "grad_norm": 1.9613899922724634, + "language_loss": 0.71315527, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.74091291, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.7148337364196777 + }, + { + "auxiliary_loss_clip": 0.01489326, + "auxiliary_loss_mlp": 0.01292747, + "balance_loss_clip": 1.14814281, + "balance_loss_mlp": 1.03830719, + "epoch": 0.2987524425071396, + "flos": 31652958368160.0, + "grad_norm": 1.7117607684214933, + "language_loss": 0.77015656, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79797727, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.8483989238739014 + }, + { + "auxiliary_loss_clip": 0.01496572, + "auxiliary_loss_mlp": 0.01278434, + "balance_loss_clip": 1.15461826, + "balance_loss_mlp": 1.02666402, + "epoch": 0.2988125657598076, + "flos": 21435541048800.0, + "grad_norm": 2.1610671374127084, + "language_loss": 0.70343232, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.73118234, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 2.7860937118530273 + }, + { + "auxiliary_loss_clip": 0.01497445, + "auxiliary_loss_mlp": 0.01293216, + "balance_loss_clip": 1.15550375, + "balance_loss_mlp": 1.03496051, + "epoch": 0.2988726890124756, + "flos": 18078903299520.0, + "grad_norm": 4.132591150962561, + "language_loss": 0.85069585, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87860245, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.6909587383270264 + }, + { + "auxiliary_loss_clip": 0.01492625, + "auxiliary_loss_mlp": 0.01294363, + "balance_loss_clip": 1.15001035, + "balance_loss_mlp": 1.04450035, + "epoch": 0.29893281226514357, + "flos": 21757440621600.0, + "grad_norm": 1.8486169196714572, + "language_loss": 0.79805136, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.8259213, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.8395838737487793 + }, + { + "auxiliary_loss_clip": 0.01497064, + "auxiliary_loss_mlp": 0.01294669, + "balance_loss_clip": 1.15523219, + "balance_loss_mlp": 1.04556966, + "epoch": 0.29899293551781153, + "flos": 16838602152480.0, + "grad_norm": 2.1256599110960286, + "language_loss": 0.8558315, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.88374877, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.785045862197876 + }, + { + "auxiliary_loss_clip": 0.01499529, + "auxiliary_loss_mlp": 0.01286776, + "balance_loss_clip": 1.15794039, + "balance_loss_mlp": 1.03882027, + "epoch": 0.2990530587704795, + "flos": 11730841696800.0, + "grad_norm": 1.75469603506931, + "language_loss": 0.77314192, + "learning_rate": 3.287480316742863e-06, + "loss": 0.80100495, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 5.8125715255737305 + }, + { + "auxiliary_loss_clip": 0.01496671, + "auxiliary_loss_mlp": 0.01294403, + "balance_loss_clip": 1.15463948, + "balance_loss_mlp": 1.04282379, + "epoch": 0.29911318202314746, + "flos": 28042502821920.0, + "grad_norm": 3.4317335047286974, + "language_loss": 0.72349018, + "learning_rate": 3.287182259060815e-06, + "loss": 0.75140095, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 4.438039302825928 + }, + { + "auxiliary_loss_clip": 0.01498052, + "auxiliary_loss_mlp": 0.01288251, + "balance_loss_clip": 1.15644372, + "balance_loss_mlp": 1.03915179, + "epoch": 0.2991733052758154, + "flos": 18735522163200.0, + "grad_norm": 2.530487439901704, + "language_loss": 0.75934052, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78720355, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.755892276763916 + }, + { + "auxiliary_loss_clip": 0.01494569, + "auxiliary_loss_mlp": 0.01295074, + "balance_loss_clip": 1.15306568, + "balance_loss_mlp": 1.04807246, + "epoch": 0.2992334285284834, + "flos": 15560941403520.0, + "grad_norm": 2.0570747716943445, + "language_loss": 0.86218387, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.89008027, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.779709815979004 + }, + { + "auxiliary_loss_clip": 0.01492518, + "auxiliary_loss_mlp": 0.01304402, + "balance_loss_clip": 1.15090096, + "balance_loss_mlp": 1.05587471, + "epoch": 0.29929355178115136, + "flos": 21799655028000.0, + "grad_norm": 1.675793523030289, + "language_loss": 0.68447232, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.7124415, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.7749316692352295 + }, + { + "auxiliary_loss_clip": 0.01504308, + "auxiliary_loss_mlp": 0.01300994, + "balance_loss_clip": 1.16287017, + "balance_loss_mlp": 1.0528481, + "epoch": 0.2993536750338193, + "flos": 21180661263360.0, + "grad_norm": 2.206821966721495, + "language_loss": 0.75945342, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78750646, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 2.7792046070098877 + }, + { + "auxiliary_loss_clip": 0.01499232, + "auxiliary_loss_mlp": 0.01305273, + "balance_loss_clip": 1.15911841, + "balance_loss_mlp": 1.05445719, + "epoch": 0.2994137982864873, + "flos": 32124751486560.0, + "grad_norm": 1.8893113104695698, + "language_loss": 0.6845516, + "learning_rate": 3.285691238725484e-06, + "loss": 0.71259665, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.8288071155548096 + }, + { + "auxiliary_loss_clip": 0.01501579, + "auxiliary_loss_mlp": 0.0130369, + "balance_loss_clip": 1.16264868, + "balance_loss_mlp": 1.05478144, + "epoch": 0.29947392153915525, + "flos": 21107535042240.0, + "grad_norm": 1.9200718308821674, + "language_loss": 0.73709202, + "learning_rate": 3.285392888352555e-06, + "loss": 0.76514471, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 2.7965776920318604 + }, + { + "auxiliary_loss_clip": 0.01498196, + "auxiliary_loss_mlp": 0.01311808, + "balance_loss_clip": 1.15759444, + "balance_loss_mlp": 1.05851209, + "epoch": 0.2995340447918232, + "flos": 21544737314400.0, + "grad_norm": 1.833586118652272, + "language_loss": 0.86406577, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.89216578, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 2.8211066722869873 + }, + { + "auxiliary_loss_clip": 0.01503965, + "auxiliary_loss_mlp": 0.01309803, + "balance_loss_clip": 1.16396284, + "balance_loss_mlp": 1.05212069, + "epoch": 0.2995941680444912, + "flos": 16726902628320.0, + "grad_norm": 2.054808095406058, + "language_loss": 0.86439818, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.89253587, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 2.8170106410980225 + }, + { + "auxiliary_loss_clip": 0.01504175, + "auxiliary_loss_mlp": 0.01304586, + "balance_loss_clip": 1.16514874, + "balance_loss_mlp": 1.05720329, + "epoch": 0.2996542912971592, + "flos": 20925629765280.0, + "grad_norm": 2.0161847859321744, + "language_loss": 0.78550607, + "learning_rate": 3.284497544825668e-06, + "loss": 0.81359369, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.810880184173584 + }, + { + "auxiliary_loss_clip": 0.01513473, + "auxiliary_loss_mlp": 0.01311677, + "balance_loss_clip": 1.17581677, + "balance_loss_mlp": 1.05952573, + "epoch": 0.29971441454982717, + "flos": 25082066639520.0, + "grad_norm": 1.6871014623245355, + "language_loss": 0.78746182, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.81571335, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.7794671058654785 + }, + { + "auxiliary_loss_clip": 0.01504894, + "auxiliary_loss_mlp": 0.01296305, + "balance_loss_clip": 1.16547894, + "balance_loss_mlp": 1.0422467, + "epoch": 0.29977453780249513, + "flos": 52559775766080.0, + "grad_norm": 2.256393725801356, + "language_loss": 0.71776325, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74577522, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 3.0613226890563965 + }, + { + "auxiliary_loss_clip": 0.01505068, + "auxiliary_loss_mlp": 0.01299849, + "balance_loss_clip": 1.16517401, + "balance_loss_mlp": 1.04044986, + "epoch": 0.2998346610551631, + "flos": 22239360558720.0, + "grad_norm": 1.75507252151574, + "language_loss": 0.73610175, + "learning_rate": 3.283601762924312e-06, + "loss": 0.76415086, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 2.9367711544036865 + }, + { + "auxiliary_loss_clip": 0.01504759, + "auxiliary_loss_mlp": 0.01293124, + "balance_loss_clip": 1.16616094, + "balance_loss_mlp": 1.04211688, + "epoch": 0.29989478430783106, + "flos": 16875089406720.0, + "grad_norm": 1.6447659633746798, + "language_loss": 0.7989862, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82696497, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.9459950923919678 + }, + { + "auxiliary_loss_clip": 0.01499865, + "auxiliary_loss_mlp": 0.0131491, + "balance_loss_clip": 1.16032887, + "balance_loss_mlp": 1.06886184, + "epoch": 0.29995490756049903, + "flos": 23771142529920.0, + "grad_norm": 1.5922195704846729, + "language_loss": 0.709216, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.7373637, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 2.7767348289489746 + }, + { + "auxiliary_loss_clip": 0.01509401, + "auxiliary_loss_mlp": 0.01292186, + "balance_loss_clip": 1.17110157, + "balance_loss_mlp": 1.03755546, + "epoch": 0.300015030813167, + "flos": 14467082339520.0, + "grad_norm": 2.0790825706497778, + "language_loss": 0.85357434, + "learning_rate": 3.282705542954199e-06, + "loss": 0.88159025, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 2.7116568088531494 + }, + { + "auxiliary_loss_clip": 0.01504538, + "auxiliary_loss_mlp": 0.01290115, + "balance_loss_clip": 1.16458917, + "balance_loss_mlp": 1.03758168, + "epoch": 0.30007515406583496, + "flos": 25194335086080.0, + "grad_norm": 2.339835203197718, + "language_loss": 0.67325097, + "learning_rate": 3.28240670566841e-06, + "loss": 0.7011975, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 2.817652463912964 + }, + { + "auxiliary_loss_clip": 0.01506119, + "auxiliary_loss_mlp": 0.01298679, + "balance_loss_clip": 1.16654658, + "balance_loss_mlp": 1.04252219, + "epoch": 0.3001352773185029, + "flos": 19393240943520.0, + "grad_norm": 1.775913420089297, + "language_loss": 0.7888521, + "learning_rate": 3.28210781975363e-06, + "loss": 0.81690013, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.7576310634613037 + }, + { + "auxiliary_loss_clip": 0.01507418, + "auxiliary_loss_mlp": 0.01298202, + "balance_loss_clip": 1.16862345, + "balance_loss_mlp": 1.04700398, + "epoch": 0.3001954005711709, + "flos": 21546026871840.0, + "grad_norm": 2.048826217614398, + "language_loss": 0.82491255, + "learning_rate": 3.281808885221193e-06, + "loss": 0.85296869, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.8264853954315186 + }, + { + "auxiliary_loss_clip": 0.01502587, + "auxiliary_loss_mlp": 0.01310734, + "balance_loss_clip": 1.16296327, + "balance_loss_mlp": 1.05591273, + "epoch": 0.30025552382383885, + "flos": 17386252319520.0, + "grad_norm": 2.292656313166052, + "language_loss": 0.86183822, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88997138, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 2.7427139282226562 + }, + { + "auxiliary_loss_clip": 0.01505959, + "auxiliary_loss_mlp": 0.01281678, + "balance_loss_clip": 1.16755414, + "balance_loss_mlp": 1.0293355, + "epoch": 0.3003156470765068, + "flos": 29535901130880.0, + "grad_norm": 1.5725924319956643, + "language_loss": 0.81212753, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.84000385, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 2.857919931411743 + }, + { + "auxiliary_loss_clip": 0.01504414, + "auxiliary_loss_mlp": 0.01298788, + "balance_loss_clip": 1.16624165, + "balance_loss_mlp": 1.04797173, + "epoch": 0.3003757703291748, + "flos": 43649375955840.0, + "grad_norm": 1.7490565364236292, + "language_loss": 0.67270982, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.70074183, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 2.9027645587921143 + }, + { + "auxiliary_loss_clip": 0.01511438, + "auxiliary_loss_mlp": 0.01295507, + "balance_loss_clip": 1.1742065, + "balance_loss_mlp": 1.0404942, + "epoch": 0.30043589358184275, + "flos": 22530575885760.0, + "grad_norm": 1.7611596701831866, + "language_loss": 0.7553491, + "learning_rate": 3.280612661141615e-06, + "loss": 0.78341854, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.744473457336426 + }, + { + "auxiliary_loss_clip": 0.01499984, + "auxiliary_loss_mlp": 0.01288941, + "balance_loss_clip": 1.16188061, + "balance_loss_mlp": 1.03545415, + "epoch": 0.30049601683451077, + "flos": 20997997423200.0, + "grad_norm": 2.767335021438184, + "language_loss": 0.77588499, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.80377424, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 2.74399471282959 + }, + { + "auxiliary_loss_clip": 0.01515533, + "auxiliary_loss_mlp": 0.01301087, + "balance_loss_clip": 1.17853343, + "balance_loss_mlp": 1.04969835, + "epoch": 0.30055614008717874, + "flos": 23918343176160.0, + "grad_norm": 3.57380667281902, + "language_loss": 0.73562121, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.76378739, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.7995762825012207 + }, + { + "auxiliary_loss_clip": 0.01502393, + "auxiliary_loss_mlp": 0.01295899, + "balance_loss_clip": 1.16398203, + "balance_loss_mlp": 1.0408864, + "epoch": 0.3006162633398467, + "flos": 19171700375040.0, + "grad_norm": 2.006864804408444, + "language_loss": 0.7608521, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78883493, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 2.8266172409057617 + }, + { + "auxiliary_loss_clip": 0.01500895, + "auxiliary_loss_mlp": 0.01279036, + "balance_loss_clip": 1.16436195, + "balance_loss_mlp": 1.02154422, + "epoch": 0.30067638659251467, + "flos": 14680733850720.0, + "grad_norm": 5.286587449056651, + "language_loss": 0.81833398, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84613335, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 2.9020683765411377 + }, + { + "auxiliary_loss_clip": 0.01513963, + "auxiliary_loss_mlp": 0.01292982, + "balance_loss_clip": 1.17570364, + "balance_loss_mlp": 1.03758752, + "epoch": 0.30073650984518263, + "flos": 23370275799360.0, + "grad_norm": 1.7149652251400695, + "language_loss": 0.80663645, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.83470595, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 2.7923636436462402 + }, + { + "auxiliary_loss_clip": 0.01506164, + "auxiliary_loss_mlp": 0.01292473, + "balance_loss_clip": 1.16912007, + "balance_loss_mlp": 1.0365063, + "epoch": 0.3007966330978506, + "flos": 22968878074560.0, + "grad_norm": 4.430499100529078, + "language_loss": 0.71369612, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.74168253, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 4.443756103515625 + }, + { + "auxiliary_loss_clip": 0.01500517, + "auxiliary_loss_mlp": 0.01300555, + "balance_loss_clip": 1.16265762, + "balance_loss_mlp": 1.04268193, + "epoch": 0.30085675635051856, + "flos": 27820924325280.0, + "grad_norm": 2.263185916937629, + "language_loss": 0.70423388, + "learning_rate": 3.27851739984233e-06, + "loss": 0.73224461, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 2.831202268600464 + }, + { + "auxiliary_loss_clip": 0.0150752, + "auxiliary_loss_mlp": 0.01290637, + "balance_loss_clip": 1.1692102, + "balance_loss_mlp": 1.03695953, + "epoch": 0.3009168796031865, + "flos": 10883897504640.0, + "grad_norm": 2.7860074263208032, + "language_loss": 0.81738603, + "learning_rate": 3.278217882782715e-06, + "loss": 0.84536761, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 2.7484023571014404 + }, + { + "auxiliary_loss_clip": 0.0150896, + "auxiliary_loss_mlp": 0.01286892, + "balance_loss_clip": 1.17074585, + "balance_loss_mlp": 1.03454947, + "epoch": 0.3009770028558545, + "flos": 23807667712320.0, + "grad_norm": 2.864847777071977, + "language_loss": 0.74627751, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77423608, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 2.7624332904815674 + }, + { + "auxiliary_loss_clip": 0.01498383, + "auxiliary_loss_mlp": 0.01278969, + "balance_loss_clip": 1.16054606, + "balance_loss_mlp": 1.02376521, + "epoch": 0.30103712610852246, + "flos": 26470592493120.0, + "grad_norm": 1.8539243473082947, + "language_loss": 0.71471137, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.74248481, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.8190205097198486 + }, + { + "auxiliary_loss_clip": 0.01498425, + "auxiliary_loss_mlp": 0.01288623, + "balance_loss_clip": 1.15962994, + "balance_loss_mlp": 1.0315125, + "epoch": 0.3010972493611904, + "flos": 22858657748640.0, + "grad_norm": 5.105162477633402, + "language_loss": 0.76841646, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.79628694, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.7616076469421387 + }, + { + "auxiliary_loss_clip": 0.01501406, + "auxiliary_loss_mlp": 0.01295132, + "balance_loss_clip": 1.16266572, + "balance_loss_mlp": 1.04393387, + "epoch": 0.3011573726138584, + "flos": 24055568716320.0, + "grad_norm": 2.138174912545331, + "language_loss": 0.84701014, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87497556, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.7739503383636475 + }, + { + "auxiliary_loss_clip": 0.01505653, + "auxiliary_loss_mlp": 0.01295073, + "balance_loss_clip": 1.16779876, + "balance_loss_mlp": 1.03929782, + "epoch": 0.30121749586652635, + "flos": 20261387341440.0, + "grad_norm": 1.977785868112266, + "language_loss": 0.83648229, + "learning_rate": 3.276719570659604e-06, + "loss": 0.86448956, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 2.755629539489746 + }, + { + "auxiliary_loss_clip": 0.01498084, + "auxiliary_loss_mlp": 0.0128638, + "balance_loss_clip": 1.16115355, + "balance_loss_mlp": 1.03537297, + "epoch": 0.3012776191191944, + "flos": 26945495720640.0, + "grad_norm": 3.450915990709767, + "language_loss": 0.85225838, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.88010299, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 4.391032457351685 + }, + { + "auxiliary_loss_clip": 0.01498874, + "auxiliary_loss_mlp": 0.01297437, + "balance_loss_clip": 1.16019464, + "balance_loss_mlp": 1.03880012, + "epoch": 0.30133774237186234, + "flos": 20414390996160.0, + "grad_norm": 2.8965522156334558, + "language_loss": 0.72254503, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.75050819, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 4.217578649520874 + }, + { + "auxiliary_loss_clip": 0.01501315, + "auxiliary_loss_mlp": 0.01299072, + "balance_loss_clip": 1.16265678, + "balance_loss_mlp": 1.04310572, + "epoch": 0.3013978656245303, + "flos": 19794524883840.0, + "grad_norm": 2.4917405630277787, + "language_loss": 0.87571329, + "learning_rate": 3.275820002334819e-06, + "loss": 0.90371716, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 4.233405590057373 + }, + { + "auxiliary_loss_clip": 0.01500145, + "auxiliary_loss_mlp": 0.01302676, + "balance_loss_clip": 1.16232038, + "balance_loss_mlp": 1.04747319, + "epoch": 0.30145798887719827, + "flos": 16251316693920.0, + "grad_norm": 2.2034136664601176, + "language_loss": 0.83190393, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85993218, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 2.902712106704712 + }, + { + "auxiliary_loss_clip": 0.01503649, + "auxiliary_loss_mlp": 0.01289174, + "balance_loss_clip": 1.16699553, + "balance_loss_mlp": 1.04064679, + "epoch": 0.30151811212986623, + "flos": 24574051764000.0, + "grad_norm": 2.4697248238401577, + "language_loss": 0.68293911, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.7108674, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.7760684490203857 + }, + { + "auxiliary_loss_clip": 0.01503669, + "auxiliary_loss_mlp": 0.01292635, + "balance_loss_clip": 1.1653111, + "balance_loss_mlp": 1.04067457, + "epoch": 0.3015782353825342, + "flos": 21874260447360.0, + "grad_norm": 2.404279671741041, + "language_loss": 0.74525392, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.77321696, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.795203447341919 + }, + { + "auxiliary_loss_clip": 0.01505321, + "auxiliary_loss_mlp": 0.01297149, + "balance_loss_clip": 1.16726136, + "balance_loss_mlp": 1.04652333, + "epoch": 0.30163835863520216, + "flos": 28771982409600.0, + "grad_norm": 1.703392049530925, + "language_loss": 0.65671104, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.68473577, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.811066150665283 + }, + { + "auxiliary_loss_clip": 0.01500535, + "auxiliary_loss_mlp": 0.01300714, + "balance_loss_clip": 1.16208649, + "balance_loss_mlp": 1.04875362, + "epoch": 0.30169848188787013, + "flos": 22968422936640.0, + "grad_norm": 2.261122641977047, + "language_loss": 0.68939888, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.7174114, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 2.791487455368042 + }, + { + "auxiliary_loss_clip": 0.0149564, + "auxiliary_loss_mlp": 0.01290681, + "balance_loss_clip": 1.15717483, + "balance_loss_mlp": 1.04520535, + "epoch": 0.3017586051405381, + "flos": 21837431839680.0, + "grad_norm": 2.226283078237637, + "language_loss": 0.79151833, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81938159, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.8254668712615967 + }, + { + "auxiliary_loss_clip": 0.014994, + "auxiliary_loss_mlp": 0.01304364, + "balance_loss_clip": 1.16244555, + "balance_loss_mlp": 1.0556457, + "epoch": 0.30181872839320606, + "flos": 22162403593440.0, + "grad_norm": 2.443565778514546, + "language_loss": 0.6984309, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.7264685, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.7805535793304443 + }, + { + "auxiliary_loss_clip": 0.0150881, + "auxiliary_loss_mlp": 0.01294334, + "balance_loss_clip": 1.16965568, + "balance_loss_mlp": 1.03817713, + "epoch": 0.301878851645874, + "flos": 18116149116960.0, + "grad_norm": 2.6706785840371206, + "language_loss": 0.78461361, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.81264508, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 2.8412978649139404 + }, + { + "auxiliary_loss_clip": 0.01506995, + "auxiliary_loss_mlp": 0.01298894, + "balance_loss_clip": 1.16781545, + "balance_loss_mlp": 1.04407191, + "epoch": 0.301938974898542, + "flos": 17604113856480.0, + "grad_norm": 4.588262321298536, + "language_loss": 0.76586711, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.793926, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 2.770036458969116 + }, + { + "auxiliary_loss_clip": 0.01494459, + "auxiliary_loss_mlp": 0.0129448, + "balance_loss_clip": 1.15626383, + "balance_loss_mlp": 1.04137492, + "epoch": 0.30199909815120995, + "flos": 11182774320000.0, + "grad_norm": 2.188379955486667, + "language_loss": 0.70037651, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72826588, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.759122848510742 + }, + { + "auxiliary_loss_clip": 0.01497518, + "auxiliary_loss_mlp": 0.01273754, + "balance_loss_clip": 1.16004717, + "balance_loss_mlp": 1.02103043, + "epoch": 0.302059221403878, + "flos": 21909989138400.0, + "grad_norm": 2.177524383396297, + "language_loss": 0.71788633, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.74559903, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.7672126293182373 + }, + { + "auxiliary_loss_clip": 0.01498518, + "auxiliary_loss_mlp": 0.01287664, + "balance_loss_clip": 1.16146064, + "balance_loss_mlp": 1.03417742, + "epoch": 0.30211934465654594, + "flos": 26398831685760.0, + "grad_norm": 1.8510185498130598, + "language_loss": 0.74565637, + "learning_rate": 3.272217377978061e-06, + "loss": 0.7735182, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.858558416366577 + }, + { + "auxiliary_loss_clip": 0.01502841, + "auxiliary_loss_mlp": 0.01288459, + "balance_loss_clip": 1.16383243, + "balance_loss_mlp": 1.03802419, + "epoch": 0.3021794679092139, + "flos": 23402515099680.0, + "grad_norm": 1.917094153254676, + "language_loss": 0.67176688, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69967985, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 2.775459051132202 + }, + { + "auxiliary_loss_clip": 0.01501214, + "auxiliary_loss_mlp": 0.01283717, + "balance_loss_clip": 1.16450131, + "balance_loss_mlp": 1.02946782, + "epoch": 0.30223959116188187, + "flos": 20262752755200.0, + "grad_norm": 1.7787411227620498, + "language_loss": 0.85287946, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.88072878, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.7420201301574707 + }, + { + "auxiliary_loss_clip": 0.01502742, + "auxiliary_loss_mlp": 0.01283022, + "balance_loss_clip": 1.16586328, + "balance_loss_mlp": 1.02953529, + "epoch": 0.30229971441454984, + "flos": 26690274581760.0, + "grad_norm": 1.6184288985655677, + "language_loss": 0.78675294, + "learning_rate": 3.271315635661351e-06, + "loss": 0.81461054, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 2.8683156967163086 + }, + { + "auxiliary_loss_clip": 0.01497876, + "auxiliary_loss_mlp": 0.01283609, + "balance_loss_clip": 1.16032803, + "balance_loss_mlp": 1.02687955, + "epoch": 0.3023598376672178, + "flos": 34347932808480.0, + "grad_norm": 2.1225221261182448, + "language_loss": 0.77033317, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79814804, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 2.878357410430908 + }, + { + "auxiliary_loss_clip": 0.01505996, + "auxiliary_loss_mlp": 0.01296247, + "balance_loss_clip": 1.1676116, + "balance_loss_mlp": 1.04562116, + "epoch": 0.30241996091988577, + "flos": 23114371953600.0, + "grad_norm": 3.6102406219247363, + "language_loss": 0.82733524, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.85535765, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.8174808025360107 + }, + { + "auxiliary_loss_clip": 0.01500996, + "auxiliary_loss_mlp": 0.01288777, + "balance_loss_clip": 1.16327012, + "balance_loss_mlp": 1.03471828, + "epoch": 0.30248008417255373, + "flos": 19391989314240.0, + "grad_norm": 1.734699667876591, + "language_loss": 0.69749737, + "learning_rate": 3.270413459468905e-06, + "loss": 0.72539508, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 2.7387349605560303 + }, + { + "auxiliary_loss_clip": 0.01500699, + "auxiliary_loss_mlp": 0.01285809, + "balance_loss_clip": 1.16349304, + "balance_loss_mlp": 1.0317502, + "epoch": 0.3025402074252217, + "flos": 23772014877600.0, + "grad_norm": 1.6916045925635859, + "language_loss": 0.82289958, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.85076469, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.845798969268799 + }, + { + "auxiliary_loss_clip": 0.01512909, + "auxiliary_loss_mlp": 0.0128635, + "balance_loss_clip": 1.17502534, + "balance_loss_mlp": 1.02523422, + "epoch": 0.30260033067788966, + "flos": 25996485756960.0, + "grad_norm": 2.4105942320262628, + "language_loss": 0.73332572, + "learning_rate": 3.269811767783906e-06, + "loss": 0.76131833, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 2.836771249771118 + }, + { + "auxiliary_loss_clip": 0.01498699, + "auxiliary_loss_mlp": 0.01290588, + "balance_loss_clip": 1.16100621, + "balance_loss_mlp": 1.03900909, + "epoch": 0.3026604539305576, + "flos": 25376923069920.0, + "grad_norm": 1.6088474502054295, + "language_loss": 0.7386384, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76653135, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.8624398708343506 + }, + { + "auxiliary_loss_clip": 0.01504315, + "auxiliary_loss_mlp": 0.01289852, + "balance_loss_clip": 1.16619325, + "balance_loss_mlp": 1.0377003, + "epoch": 0.3027205771832256, + "flos": 25815149402400.0, + "grad_norm": 2.0128692113669113, + "language_loss": 0.72186792, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74980956, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.789098024368286 + }, + { + "auxiliary_loss_clip": 0.0150329, + "auxiliary_loss_mlp": 0.01291964, + "balance_loss_clip": 1.16476524, + "balance_loss_mlp": 1.04210091, + "epoch": 0.30278070043589356, + "flos": 27347007229920.0, + "grad_norm": 2.18821076366741, + "language_loss": 0.87915862, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.90711111, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 2.7800395488739014 + }, + { + "auxiliary_loss_clip": 0.0150299, + "auxiliary_loss_mlp": 0.01289452, + "balance_loss_clip": 1.16567576, + "balance_loss_mlp": 1.03958941, + "epoch": 0.3028408236885616, + "flos": 24788310125760.0, + "grad_norm": 1.6472724944644466, + "language_loss": 0.77380335, + "learning_rate": 3.268607806688536e-06, + "loss": 0.80172777, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 2.8315815925598145 + }, + { + "auxiliary_loss_clip": 0.01509055, + "auxiliary_loss_mlp": 0.01304129, + "balance_loss_clip": 1.17097616, + "balance_loss_mlp": 1.05235863, + "epoch": 0.30290094694122954, + "flos": 12934124595360.0, + "grad_norm": 2.198851848823883, + "language_loss": 0.7696898, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79782164, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 2.6685872077941895 + }, + { + "auxiliary_loss_clip": 0.01498548, + "auxiliary_loss_mlp": 0.01281867, + "balance_loss_clip": 1.16204906, + "balance_loss_mlp": 1.03009677, + "epoch": 0.3029610701938975, + "flos": 25918542659520.0, + "grad_norm": 19.052033994939396, + "language_loss": 0.73726213, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76506627, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 2.7984986305236816 + }, + { + "auxiliary_loss_clip": 0.0149555, + "auxiliary_loss_mlp": 0.01282649, + "balance_loss_clip": 1.15831971, + "balance_loss_mlp": 1.03259587, + "epoch": 0.3030211934465655, + "flos": 21983153287680.0, + "grad_norm": 2.220301177840394, + "language_loss": 0.79837489, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82615685, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 2.7416441440582275 + }, + { + "auxiliary_loss_clip": 0.01509156, + "auxiliary_loss_mlp": 0.01280504, + "balance_loss_clip": 1.17284775, + "balance_loss_mlp": 1.02987862, + "epoch": 0.30308131669923344, + "flos": 20993863253760.0, + "grad_norm": 2.1538115508262616, + "language_loss": 0.82241881, + "learning_rate": 3.267403075901438e-06, + "loss": 0.85031545, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 2.8735179901123047 + }, + { + "auxiliary_loss_clip": 0.01585939, + "auxiliary_loss_mlp": 0.01239296, + "balance_loss_clip": 1.25102258, + "balance_loss_mlp": 1.02490997, + "epoch": 0.3031414399519014, + "flos": 60555376812960.0, + "grad_norm": 0.7792964860408262, + "language_loss": 0.59466398, + "learning_rate": 3.267101773025978e-06, + "loss": 0.62291628, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 5.12737774848938 + }, + { + "auxiliary_loss_clip": 0.015054, + "auxiliary_loss_mlp": 0.01286685, + "balance_loss_clip": 1.1676147, + "balance_loss_mlp": 1.03205371, + "epoch": 0.30320156320456937, + "flos": 21909799497600.0, + "grad_norm": 2.854562571407636, + "language_loss": 0.71589875, + "learning_rate": 3.266800422101892e-06, + "loss": 0.74381959, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 2.7833030223846436 + }, + { + "auxiliary_loss_clip": 0.01506068, + "auxiliary_loss_mlp": 0.01292221, + "balance_loss_clip": 1.16811943, + "balance_loss_mlp": 1.0431217, + "epoch": 0.30326168645723733, + "flos": 21654919712160.0, + "grad_norm": 2.762794609601558, + "language_loss": 0.69975102, + "learning_rate": 3.266499023140606e-06, + "loss": 0.72773385, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 2.7785379886627197 + }, + { + "auxiliary_loss_clip": 0.01508662, + "auxiliary_loss_mlp": 0.01283159, + "balance_loss_clip": 1.17054915, + "balance_loss_mlp": 1.03463137, + "epoch": 0.3033218097099053, + "flos": 21873426027840.0, + "grad_norm": 1.423682032837964, + "language_loss": 0.77553928, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.8034575, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.774914264678955 + }, + { + "auxiliary_loss_clip": 0.01508946, + "auxiliary_loss_mlp": 0.01295734, + "balance_loss_clip": 1.17187595, + "balance_loss_mlp": 1.04167485, + "epoch": 0.30338193296257326, + "flos": 27092203300800.0, + "grad_norm": 1.8778106521511568, + "language_loss": 0.72905993, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.75710678, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.8242664337158203 + }, + { + "auxiliary_loss_clip": 0.01512169, + "auxiliary_loss_mlp": 0.01297779, + "balance_loss_clip": 1.17383659, + "balance_loss_mlp": 1.0454365, + "epoch": 0.30344205621524123, + "flos": 19536269492160.0, + "grad_norm": 2.1020988478637386, + "language_loss": 0.81328607, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.8413856, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.8372437953948975 + }, + { + "auxiliary_loss_clip": 0.01514877, + "auxiliary_loss_mlp": 0.01283607, + "balance_loss_clip": 1.17744505, + "balance_loss_mlp": 1.03164673, + "epoch": 0.3035021794679092, + "flos": 23913033233760.0, + "grad_norm": 1.874586735197371, + "language_loss": 0.72268111, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7506659, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.817469596862793 + }, + { + "auxiliary_loss_clip": 0.01511653, + "auxiliary_loss_mlp": 0.01283965, + "balance_loss_clip": 1.17506278, + "balance_loss_mlp": 1.03410184, + "epoch": 0.30356230272057716, + "flos": 16145458106400.0, + "grad_norm": 2.0169668956760054, + "language_loss": 0.75691092, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.78486705, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 2.8481204509735107 + }, + { + "auxiliary_loss_clip": 0.01505568, + "auxiliary_loss_mlp": 0.0129295, + "balance_loss_clip": 1.16802514, + "balance_loss_mlp": 1.04442263, + "epoch": 0.3036224259732452, + "flos": 28917324576000.0, + "grad_norm": 1.8968913205547373, + "language_loss": 0.8216815, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84966671, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 4.330431699752808 + }, + { + "auxiliary_loss_clip": 0.01514464, + "auxiliary_loss_mlp": 0.01287993, + "balance_loss_clip": 1.17606187, + "balance_loss_mlp": 1.03565073, + "epoch": 0.30368254922591315, + "flos": 21107724683040.0, + "grad_norm": 2.840687149674123, + "language_loss": 0.73722517, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.76524979, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 5.75999641418457 + }, + { + "auxiliary_loss_clip": 0.01507359, + "auxiliary_loss_mlp": 0.01299616, + "balance_loss_clip": 1.17010128, + "balance_loss_mlp": 1.05089736, + "epoch": 0.3037426724785811, + "flos": 23004492981120.0, + "grad_norm": 1.7147795114009978, + "language_loss": 0.7630924, + "learning_rate": 3.264086103483033e-06, + "loss": 0.79116213, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.898644208908081 + }, + { + "auxiliary_loss_clip": 0.01510149, + "auxiliary_loss_mlp": 0.01293229, + "balance_loss_clip": 1.17210913, + "balance_loss_mlp": 1.04222143, + "epoch": 0.3038027957312491, + "flos": 15634522762560.0, + "grad_norm": 2.4137824383063604, + "language_loss": 0.82880723, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.85684103, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 2.862110137939453 + }, + { + "auxiliary_loss_clip": 0.01505984, + "auxiliary_loss_mlp": 0.01287554, + "balance_loss_clip": 1.16841245, + "balance_loss_mlp": 1.03502083, + "epoch": 0.30386291898391704, + "flos": 12715656207840.0, + "grad_norm": 1.7031217073562444, + "language_loss": 0.71234602, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.7402814, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.7438137531280518 + }, + { + "auxiliary_loss_clip": 0.01509351, + "auxiliary_loss_mlp": 0.01286132, + "balance_loss_clip": 1.17087328, + "balance_loss_mlp": 1.03493428, + "epoch": 0.303923042236585, + "flos": 26361851365440.0, + "grad_norm": 1.7869292057549404, + "language_loss": 0.69643605, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.72439086, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.7588980197906494 + }, + { + "auxiliary_loss_clip": 0.01503425, + "auxiliary_loss_mlp": 0.01281283, + "balance_loss_clip": 1.16459751, + "balance_loss_mlp": 1.0274148, + "epoch": 0.30398316548925297, + "flos": 19721284878240.0, + "grad_norm": 2.29637977914522, + "language_loss": 0.67614758, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.70399463, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 2.8207409381866455 + }, + { + "auxiliary_loss_clip": 0.01509261, + "auxiliary_loss_mlp": 0.0127859, + "balance_loss_clip": 1.17078722, + "balance_loss_mlp": 1.02586675, + "epoch": 0.30404328874192094, + "flos": 24241949516160.0, + "grad_norm": 1.689256150631794, + "language_loss": 0.8248387, + "learning_rate": 3.262576470461507e-06, + "loss": 0.85271722, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.8673291206359863 + }, + { + "auxiliary_loss_clip": 0.01506278, + "auxiliary_loss_mlp": 0.01284706, + "balance_loss_clip": 1.16743279, + "balance_loss_mlp": 1.03064692, + "epoch": 0.3041034119945889, + "flos": 24501266896320.0, + "grad_norm": 1.7124498210596002, + "language_loss": 0.89145267, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91936255, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.8956520557403564 + }, + { + "auxiliary_loss_clip": 0.01512781, + "auxiliary_loss_mlp": 0.01293918, + "balance_loss_clip": 1.17517531, + "balance_loss_mlp": 1.03699791, + "epoch": 0.30416353524725687, + "flos": 28291162389120.0, + "grad_norm": 2.0453649432548486, + "language_loss": 0.71516693, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.74323392, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 2.8117406368255615 + }, + { + "auxiliary_loss_clip": 0.01503704, + "auxiliary_loss_mlp": 0.01280325, + "balance_loss_clip": 1.16577733, + "balance_loss_mlp": 1.02760077, + "epoch": 0.30422365849992483, + "flos": 23662704827520.0, + "grad_norm": 1.7207093110582412, + "language_loss": 0.73422658, + "learning_rate": 3.26167011603268e-06, + "loss": 0.76206696, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 2.9776968955993652 + }, + { + "auxiliary_loss_clip": 0.01512207, + "auxiliary_loss_mlp": 0.0128336, + "balance_loss_clip": 1.17382526, + "balance_loss_mlp": 1.02910995, + "epoch": 0.3042837817525928, + "flos": 23000776021440.0, + "grad_norm": 1.829740435599098, + "language_loss": 0.77291214, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.80086774, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 2.8135085105895996 + }, + { + "auxiliary_loss_clip": 0.01512243, + "auxiliary_loss_mlp": 0.01292477, + "balance_loss_clip": 1.17355335, + "balance_loss_mlp": 1.03765559, + "epoch": 0.30434390500526076, + "flos": 22084043286240.0, + "grad_norm": 2.3774892643476995, + "language_loss": 0.819511, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84755814, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.7381391525268555 + }, + { + "auxiliary_loss_clip": 0.01499989, + "auxiliary_loss_mlp": 0.01273705, + "balance_loss_clip": 1.16238272, + "balance_loss_mlp": 1.02460551, + "epoch": 0.3044040282579287, + "flos": 25485853838400.0, + "grad_norm": 1.8658751233254818, + "language_loss": 0.75054729, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.77828419, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.8582241535186768 + }, + { + "auxiliary_loss_clip": 0.01512984, + "auxiliary_loss_mlp": 0.01293842, + "balance_loss_clip": 1.17379558, + "balance_loss_mlp": 1.04417002, + "epoch": 0.30446415151059675, + "flos": 21947690093760.0, + "grad_norm": 2.1840714179881227, + "language_loss": 0.84530622, + "learning_rate": 3.26046097371721e-06, + "loss": 0.87337446, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.8409531116485596 + }, + { + "auxiliary_loss_clip": 0.0150227, + "auxiliary_loss_mlp": 0.01296195, + "balance_loss_clip": 1.16445565, + "balance_loss_mlp": 1.04537857, + "epoch": 0.3045242747632647, + "flos": 16437280284000.0, + "grad_norm": 2.652127283877681, + "language_loss": 0.76272118, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.79070592, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 2.812563896179199 + }, + { + "auxiliary_loss_clip": 0.01507949, + "auxiliary_loss_mlp": 0.01298947, + "balance_loss_clip": 1.17141366, + "balance_loss_mlp": 1.04374397, + "epoch": 0.3045843980159327, + "flos": 31543041467520.0, + "grad_norm": 2.1330183198649375, + "language_loss": 0.62517804, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.653247, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 2.8595030307769775 + }, + { + "auxiliary_loss_clip": 0.01509158, + "auxiliary_loss_mlp": 0.01291585, + "balance_loss_clip": 1.17026317, + "balance_loss_mlp": 1.03828931, + "epoch": 0.30464452126860064, + "flos": 17855314610400.0, + "grad_norm": 2.0919512059801204, + "language_loss": 0.82609111, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85409856, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.846766710281372 + }, + { + "auxiliary_loss_clip": 0.01499919, + "auxiliary_loss_mlp": 0.01283056, + "balance_loss_clip": 1.16160429, + "balance_loss_mlp": 1.03662682, + "epoch": 0.3047046445212686, + "flos": 20633504162400.0, + "grad_norm": 2.408000841503524, + "language_loss": 0.63511407, + "learning_rate": 3.259251066652873e-06, + "loss": 0.66294384, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 2.7774312496185303 + }, + { + "auxiliary_loss_clip": 0.01501408, + "auxiliary_loss_mlp": 0.0128888, + "balance_loss_clip": 1.16436529, + "balance_loss_mlp": 1.0403527, + "epoch": 0.3047647677739366, + "flos": 21289705816320.0, + "grad_norm": 2.0986180587675882, + "language_loss": 0.7475276, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77543044, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.983445882797241 + }, + { + "auxiliary_loss_clip": 0.01496934, + "auxiliary_loss_mlp": 0.01299749, + "balance_loss_clip": 1.160339, + "balance_loss_mlp": 1.0559895, + "epoch": 0.30482489102660454, + "flos": 20998035351360.0, + "grad_norm": 2.6902607591750587, + "language_loss": 0.75784785, + "learning_rate": 3.258645826569261e-06, + "loss": 0.78581464, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 2.7906131744384766 + }, + { + "auxiliary_loss_clip": 0.0150273, + "auxiliary_loss_mlp": 0.01287297, + "balance_loss_clip": 1.16461015, + "balance_loss_mlp": 1.03514564, + "epoch": 0.3048850142792725, + "flos": 26293466164320.0, + "grad_norm": 1.73939625469437, + "language_loss": 0.81667423, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.84457451, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 2.80409836769104 + }, + { + "auxiliary_loss_clip": 0.01503541, + "auxiliary_loss_mlp": 0.01287009, + "balance_loss_clip": 1.16698933, + "balance_loss_mlp": 1.03371286, + "epoch": 0.30494513753194047, + "flos": 22348443039840.0, + "grad_norm": 2.0216313854324346, + "language_loss": 0.7620241, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78992963, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 2.838064193725586 + }, + { + "auxiliary_loss_clip": 0.01505334, + "auxiliary_loss_mlp": 0.01294285, + "balance_loss_clip": 1.16743743, + "balance_loss_mlp": 1.04785514, + "epoch": 0.30500526078460843, + "flos": 19539758882880.0, + "grad_norm": 1.9452778769793817, + "language_loss": 0.71173209, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73972821, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.7544069290161133 + }, + { + "auxiliary_loss_clip": 0.0150968, + "auxiliary_loss_mlp": 0.0130976, + "balance_loss_clip": 1.17193162, + "balance_loss_mlp": 1.05875325, + "epoch": 0.3050653840372764, + "flos": 14467082339520.0, + "grad_norm": 6.857017226355851, + "language_loss": 0.76868188, + "learning_rate": 3.257434773758163e-06, + "loss": 0.79687631, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.7666027545928955 + }, + { + "auxiliary_loss_clip": 0.01514562, + "auxiliary_loss_mlp": 0.01308301, + "balance_loss_clip": 1.17728209, + "balance_loss_mlp": 1.05920184, + "epoch": 0.30512550728994436, + "flos": 24246121613760.0, + "grad_norm": 2.2727867423918418, + "language_loss": 0.7475642, + "learning_rate": 3.25713189132155e-06, + "loss": 0.77579284, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.791019916534424 + }, + { + "auxiliary_loss_clip": 0.01508852, + "auxiliary_loss_mlp": 0.01309773, + "balance_loss_clip": 1.17108583, + "balance_loss_mlp": 1.05800283, + "epoch": 0.30518563054261233, + "flos": 16362295583040.0, + "grad_norm": 1.9317401581707208, + "language_loss": 0.75287664, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.78106284, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 2.7496368885040283 + }, + { + "auxiliary_loss_clip": 0.01511872, + "auxiliary_loss_mlp": 0.01303487, + "balance_loss_clip": 1.17536092, + "balance_loss_mlp": 1.05305219, + "epoch": 0.30524575379528035, + "flos": 21581717634720.0, + "grad_norm": 1.8692386741381775, + "language_loss": 0.79315758, + "learning_rate": 3.25652598344811e-06, + "loss": 0.82131112, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 2.768805742263794 + }, + { + "auxiliary_loss_clip": 0.01504247, + "auxiliary_loss_mlp": 0.01289215, + "balance_loss_clip": 1.16772771, + "balance_loss_mlp": 1.04030573, + "epoch": 0.3053058770479483, + "flos": 16547121328320.0, + "grad_norm": 2.2592400396394003, + "language_loss": 0.74576116, + "learning_rate": 3.256222958034259e-06, + "loss": 0.77369577, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.7557504177093506 + }, + { + "auxiliary_loss_clip": 0.01501902, + "auxiliary_loss_mlp": 0.01294697, + "balance_loss_clip": 1.16519523, + "balance_loss_mlp": 1.04674149, + "epoch": 0.3053660003006163, + "flos": 12314561908320.0, + "grad_norm": 5.033516975967406, + "language_loss": 0.67184502, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69981098, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 2.8574883937835693 + }, + { + "auxiliary_loss_clip": 0.01498211, + "auxiliary_loss_mlp": 0.01282031, + "balance_loss_clip": 1.16123247, + "balance_loss_mlp": 1.03331304, + "epoch": 0.30542612355328425, + "flos": 23114523666240.0, + "grad_norm": 2.698782585369149, + "language_loss": 0.7992326, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82703501, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 2.800823926925659 + }, + { + "auxiliary_loss_clip": 0.01506439, + "auxiliary_loss_mlp": 0.01298903, + "balance_loss_clip": 1.17036307, + "balance_loss_mlp": 1.04904056, + "epoch": 0.3054862468059522, + "flos": 24391767205440.0, + "grad_norm": 2.448262682247299, + "language_loss": 0.8083058, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83635926, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 4.367122173309326 + }, + { + "auxiliary_loss_clip": 0.01508928, + "auxiliary_loss_mlp": 0.01303251, + "balance_loss_clip": 1.17049956, + "balance_loss_mlp": 1.054533, + "epoch": 0.3055463700586202, + "flos": 29388586700160.0, + "grad_norm": 1.8451999343516878, + "language_loss": 0.71992373, + "learning_rate": 3.255010380132783e-06, + "loss": 0.7480455, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.8549535274505615 + }, + { + "auxiliary_loss_clip": 0.01504911, + "auxiliary_loss_mlp": 0.01293483, + "balance_loss_clip": 1.1657666, + "balance_loss_mlp": 1.04342985, + "epoch": 0.30560649331128814, + "flos": 25594063971840.0, + "grad_norm": 2.050819575066058, + "language_loss": 0.73027503, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.758259, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.7921555042266846 + }, + { + "auxiliary_loss_clip": 0.01493991, + "auxiliary_loss_mlp": 0.01285074, + "balance_loss_clip": 1.1539762, + "balance_loss_mlp": 1.03711855, + "epoch": 0.3056666165639561, + "flos": 19129941106560.0, + "grad_norm": 13.409436866515506, + "language_loss": 0.70981771, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73760837, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.76129150390625 + }, + { + "auxiliary_loss_clip": 0.01498071, + "auxiliary_loss_mlp": 0.0129617, + "balance_loss_clip": 1.15851474, + "balance_loss_mlp": 1.04382789, + "epoch": 0.30572673981662407, + "flos": 15525705778560.0, + "grad_norm": 3.070970056378738, + "language_loss": 0.78854859, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.81649095, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 2.743953227996826 + }, + { + "auxiliary_loss_clip": 0.01495621, + "auxiliary_loss_mlp": 0.01281506, + "balance_loss_clip": 1.15568423, + "balance_loss_mlp": 1.03412247, + "epoch": 0.30578686306929204, + "flos": 21508667269920.0, + "grad_norm": 1.7198106618885227, + "language_loss": 0.77922744, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80699873, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.8612539768218994 + }, + { + "auxiliary_loss_clip": 0.01500809, + "auxiliary_loss_mlp": 0.01291739, + "balance_loss_clip": 1.15982461, + "balance_loss_mlp": 1.04416466, + "epoch": 0.30584698632196, + "flos": 20956086442080.0, + "grad_norm": 1.7663972509779362, + "language_loss": 0.76853895, + "learning_rate": 3.253493587064563e-06, + "loss": 0.79646444, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.80283260345459 + }, + { + "auxiliary_loss_clip": 0.01503394, + "auxiliary_loss_mlp": 0.01286233, + "balance_loss_clip": 1.16211152, + "balance_loss_mlp": 1.03827822, + "epoch": 0.30590710957462797, + "flos": 24683437670400.0, + "grad_norm": 2.113120651241908, + "language_loss": 0.72624016, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.75413644, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 4.374212980270386 + }, + { + "auxiliary_loss_clip": 0.01502842, + "auxiliary_loss_mlp": 0.01288542, + "balance_loss_clip": 1.16156161, + "balance_loss_mlp": 1.03524637, + "epoch": 0.30596723282729593, + "flos": 17088513348960.0, + "grad_norm": 4.93134173278705, + "language_loss": 0.80071199, + "learning_rate": 3.252886537028521e-06, + "loss": 0.82862592, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 4.223779916763306 + }, + { + "auxiliary_loss_clip": 0.01503056, + "auxiliary_loss_mlp": 0.0128919, + "balance_loss_clip": 1.161273, + "balance_loss_mlp": 1.0370388, + "epoch": 0.30602735607996395, + "flos": 22859340455520.0, + "grad_norm": 1.8713366000739078, + "language_loss": 0.76961285, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.7975353, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 4.38427209854126 + }, + { + "auxiliary_loss_clip": 0.01503317, + "auxiliary_loss_mlp": 0.012909, + "balance_loss_clip": 1.16105354, + "balance_loss_mlp": 1.038939, + "epoch": 0.3060874793326319, + "flos": 29864058850080.0, + "grad_norm": 1.9192772130589946, + "language_loss": 0.76507664, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.79301876, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 2.9230234622955322 + }, + { + "auxiliary_loss_clip": 0.01495896, + "auxiliary_loss_mlp": 0.01283976, + "balance_loss_clip": 1.15438843, + "balance_loss_mlp": 1.03125191, + "epoch": 0.3061476025852999, + "flos": 20450650681440.0, + "grad_norm": 1.7442120060736024, + "language_loss": 0.71750718, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.7453059, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.8343169689178467 + }, + { + "auxiliary_loss_clip": 0.01505657, + "auxiliary_loss_mlp": 0.01290823, + "balance_loss_clip": 1.16461062, + "balance_loss_mlp": 1.03638268, + "epoch": 0.30620772583796785, + "flos": 19393544368800.0, + "grad_norm": 2.1437181543237354, + "language_loss": 0.82398808, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.85195285, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.8073818683624268 + }, + { + "auxiliary_loss_clip": 0.01506514, + "auxiliary_loss_mlp": 0.0128619, + "balance_loss_clip": 1.16504955, + "balance_loss_mlp": 1.03384745, + "epoch": 0.3062678490906358, + "flos": 24026856734880.0, + "grad_norm": 1.8008563388096983, + "language_loss": 0.74874794, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77667499, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.7879793643951416 + }, + { + "auxiliary_loss_clip": 0.01510282, + "auxiliary_loss_mlp": 0.01283988, + "balance_loss_clip": 1.16733789, + "balance_loss_mlp": 1.03298104, + "epoch": 0.3063279723433038, + "flos": 19756634287680.0, + "grad_norm": 3.9300231446255793, + "language_loss": 0.76056963, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78851235, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 2.8161206245422363 + }, + { + "auxiliary_loss_clip": 0.01500669, + "auxiliary_loss_mlp": 0.01282743, + "balance_loss_clip": 1.15756893, + "balance_loss_mlp": 1.03402483, + "epoch": 0.30638809559597174, + "flos": 22451912153280.0, + "grad_norm": 1.9498014598114524, + "language_loss": 0.8099156, + "learning_rate": 3.250760365955042e-06, + "loss": 0.83774972, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 2.800848960876465 + }, + { + "auxiliary_loss_clip": 0.01496233, + "auxiliary_loss_mlp": 0.01282955, + "balance_loss_clip": 1.1544131, + "balance_loss_mlp": 1.03099406, + "epoch": 0.3064482188486397, + "flos": 17167063296960.0, + "grad_norm": 2.187056758533156, + "language_loss": 0.81919378, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84698558, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.769648313522339 + }, + { + "auxiliary_loss_clip": 0.01507115, + "auxiliary_loss_mlp": 0.01283657, + "balance_loss_clip": 1.16413748, + "balance_loss_mlp": 1.03264964, + "epoch": 0.3065083421013077, + "flos": 23770725320160.0, + "grad_norm": 2.291257600076525, + "language_loss": 0.77898681, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80689454, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 2.8181397914886475 + }, + { + "auxiliary_loss_clip": 0.01502436, + "auxiliary_loss_mlp": 0.01279799, + "balance_loss_clip": 1.15875709, + "balance_loss_mlp": 1.03203392, + "epoch": 0.30656846535397564, + "flos": 26434219023360.0, + "grad_norm": 2.395769751061584, + "language_loss": 0.84437871, + "learning_rate": 3.249848438115917e-06, + "loss": 0.87220109, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.8640379905700684 + }, + { + "auxiliary_loss_clip": 0.01502723, + "auxiliary_loss_mlp": 0.01296564, + "balance_loss_clip": 1.15923905, + "balance_loss_mlp": 1.0465107, + "epoch": 0.3066285886066436, + "flos": 26654242465440.0, + "grad_norm": 5.444693211453497, + "language_loss": 0.85725331, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.88524616, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 2.894188165664673 + }, + { + "auxiliary_loss_clip": 0.01498108, + "auxiliary_loss_mlp": 0.01282814, + "balance_loss_clip": 1.15531373, + "balance_loss_mlp": 1.03237915, + "epoch": 0.30668871185931157, + "flos": 15051788683200.0, + "grad_norm": 1.8740782418043975, + "language_loss": 0.78908575, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81689495, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 2.802767038345337 + }, + { + "auxiliary_loss_clip": 0.01514261, + "auxiliary_loss_mlp": 0.01292405, + "balance_loss_clip": 1.1714499, + "balance_loss_mlp": 1.04101622, + "epoch": 0.30674883511197953, + "flos": 20084071371840.0, + "grad_norm": 1.963609447888691, + "language_loss": 0.80158985, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82965648, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.8101534843444824 + }, + { + "auxiliary_loss_clip": 0.01507606, + "auxiliary_loss_mlp": 0.01295648, + "balance_loss_clip": 1.16404581, + "balance_loss_mlp": 1.0431149, + "epoch": 0.30680895836464755, + "flos": 22896055278720.0, + "grad_norm": 2.0232247957734906, + "language_loss": 0.8880415, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91607404, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.807513475418091 + }, + { + "auxiliary_loss_clip": 0.01502051, + "auxiliary_loss_mlp": 0.01287288, + "balance_loss_clip": 1.15794015, + "balance_loss_mlp": 1.03857005, + "epoch": 0.3068690816173155, + "flos": 23698433518560.0, + "grad_norm": 2.028114717711303, + "language_loss": 0.74267936, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.77057278, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 2.8226661682128906 + }, + { + "auxiliary_loss_clip": 0.015027, + "auxiliary_loss_mlp": 0.01283863, + "balance_loss_clip": 1.16279995, + "balance_loss_mlp": 1.03209305, + "epoch": 0.3069292048699835, + "flos": 23553356849280.0, + "grad_norm": 2.0722900431347675, + "language_loss": 0.72508138, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75294697, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.8290107250213623 + }, + { + "auxiliary_loss_clip": 0.0151235, + "auxiliary_loss_mlp": 0.0129563, + "balance_loss_clip": 1.17121387, + "balance_loss_mlp": 1.04805601, + "epoch": 0.30698932812265145, + "flos": 24533771693760.0, + "grad_norm": 2.4482390921624444, + "language_loss": 0.87292373, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.90100348, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 2.848644495010376 + }, + { + "auxiliary_loss_clip": 0.01504038, + "auxiliary_loss_mlp": 0.01288643, + "balance_loss_clip": 1.16457379, + "balance_loss_mlp": 1.03038824, + "epoch": 0.3070494513753194, + "flos": 20998566345600.0, + "grad_norm": 2.2291870181676097, + "language_loss": 0.70983785, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73776466, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 2.8487443923950195 + }, + { + "auxiliary_loss_clip": 0.01501786, + "auxiliary_loss_mlp": 0.01294725, + "balance_loss_clip": 1.15997267, + "balance_loss_mlp": 1.04524422, + "epoch": 0.3071095746279874, + "flos": 19027951191360.0, + "grad_norm": 2.2935609615616657, + "language_loss": 0.72311926, + "learning_rate": 3.247110096547814e-06, + "loss": 0.75108439, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 2.8257648944854736 + }, + { + "auxiliary_loss_clip": 0.0150373, + "auxiliary_loss_mlp": 0.01283339, + "balance_loss_clip": 1.16331315, + "balance_loss_mlp": 1.03500223, + "epoch": 0.30716969788065535, + "flos": 21217679511840.0, + "grad_norm": 1.788372175023749, + "language_loss": 0.85801643, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.88588715, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.881730318069458 + }, + { + "auxiliary_loss_clip": 0.01503616, + "auxiliary_loss_mlp": 0.01285239, + "balance_loss_clip": 1.16308546, + "balance_loss_mlp": 1.03613973, + "epoch": 0.3072298211333233, + "flos": 25774983116640.0, + "grad_norm": 1.7964600983410557, + "language_loss": 0.67527592, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.70316452, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.8871262073516846 + }, + { + "auxiliary_loss_clip": 0.01498474, + "auxiliary_loss_mlp": 0.01291585, + "balance_loss_clip": 1.1589148, + "balance_loss_mlp": 1.04515576, + "epoch": 0.3072899443859913, + "flos": 25851333231360.0, + "grad_norm": 1.6171976882262402, + "language_loss": 0.7736156, + "learning_rate": 3.246196464379919e-06, + "loss": 0.80151618, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 2.913438320159912 + }, + { + "auxiliary_loss_clip": 0.01504613, + "auxiliary_loss_mlp": 0.0128807, + "balance_loss_clip": 1.1653651, + "balance_loss_mlp": 1.03916132, + "epoch": 0.30735006763865924, + "flos": 25925255943840.0, + "grad_norm": 2.2756636864916415, + "language_loss": 0.67167622, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69960308, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 2.9061439037323 + }, + { + "auxiliary_loss_clip": 0.01504064, + "auxiliary_loss_mlp": 0.01294325, + "balance_loss_clip": 1.16383481, + "balance_loss_mlp": 1.04484332, + "epoch": 0.3074101908913272, + "flos": 30919496323680.0, + "grad_norm": 3.9565314884728355, + "language_loss": 0.79678118, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.82476509, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 2.834639549255371 + }, + { + "auxiliary_loss_clip": 0.0149307, + "auxiliary_loss_mlp": 0.01293707, + "balance_loss_clip": 1.15370452, + "balance_loss_mlp": 1.04517901, + "epoch": 0.30747031414399517, + "flos": 18402775136640.0, + "grad_norm": 2.7773164646156125, + "language_loss": 0.77145284, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79932058, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.8117198944091797 + }, + { + "auxiliary_loss_clip": 0.01502185, + "auxiliary_loss_mlp": 0.01288839, + "balance_loss_clip": 1.16125643, + "balance_loss_mlp": 1.04145586, + "epoch": 0.30753043739666314, + "flos": 22635106987680.0, + "grad_norm": 2.846708792624814, + "language_loss": 0.62563562, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.65354586, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 2.8494954109191895 + }, + { + "auxiliary_loss_clip": 0.01501651, + "auxiliary_loss_mlp": 0.01303673, + "balance_loss_clip": 1.16090059, + "balance_loss_mlp": 1.05686188, + "epoch": 0.3075905606493311, + "flos": 27346514163840.0, + "grad_norm": 2.2297624363525568, + "language_loss": 0.83020341, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.85825658, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 2.9053900241851807 + }, + { + "auxiliary_loss_clip": 0.01497157, + "auxiliary_loss_mlp": 0.01296693, + "balance_loss_clip": 1.15589738, + "balance_loss_mlp": 1.05312502, + "epoch": 0.3076506839019991, + "flos": 22092994332000.0, + "grad_norm": 1.8895977070160062, + "language_loss": 0.7564714, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78440988, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 2.787682294845581 + }, + { + "auxiliary_loss_clip": 0.01506576, + "auxiliary_loss_mlp": 0.0129541, + "balance_loss_clip": 1.16559505, + "balance_loss_mlp": 1.04917109, + "epoch": 0.3077108071546671, + "flos": 21291753936960.0, + "grad_norm": 2.507908783423438, + "language_loss": 0.71641749, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74443734, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 4.463576078414917 + }, + { + "auxiliary_loss_clip": 0.01503835, + "auxiliary_loss_mlp": 0.01289822, + "balance_loss_clip": 1.16344166, + "balance_loss_mlp": 1.04205704, + "epoch": 0.30777093040733505, + "flos": 21432961933920.0, + "grad_norm": 1.799884913296925, + "language_loss": 0.74545753, + "learning_rate": 3.243758033520219e-06, + "loss": 0.77339417, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 2.8335680961608887 + }, + { + "auxiliary_loss_clip": 0.01498671, + "auxiliary_loss_mlp": 0.01300011, + "balance_loss_clip": 1.15684795, + "balance_loss_mlp": 1.05300975, + "epoch": 0.307831053660003, + "flos": 23151541914720.0, + "grad_norm": 1.8244503295362826, + "language_loss": 0.7988292, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82681608, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 2.8315038681030273 + }, + { + "auxiliary_loss_clip": 0.0150152, + "auxiliary_loss_mlp": 0.01285295, + "balance_loss_clip": 1.16098607, + "balance_loss_mlp": 1.03905642, + "epoch": 0.307891176912671, + "flos": 17021683202400.0, + "grad_norm": 1.6462309908371093, + "language_loss": 0.79789251, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82576066, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.847952365875244 + }, + { + "auxiliary_loss_clip": 0.01502166, + "auxiliary_loss_mlp": 0.01290263, + "balance_loss_clip": 1.1616993, + "balance_loss_mlp": 1.04287958, + "epoch": 0.30795130016533895, + "flos": 27707290464960.0, + "grad_norm": 1.5814993241911586, + "language_loss": 0.82680237, + "learning_rate": 3.242842843433319e-06, + "loss": 0.85472667, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.8438684940338135 + }, + { + "auxiliary_loss_clip": 0.01614474, + "auxiliary_loss_mlp": 0.01262856, + "balance_loss_clip": 1.2781167, + "balance_loss_mlp": 1.04694366, + "epoch": 0.3080114234180069, + "flos": 69066161521920.0, + "grad_norm": 0.7511171057630911, + "language_loss": 0.58615363, + "learning_rate": 3.242537685798143e-06, + "loss": 0.61492693, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.486781358718872 + }, + { + "auxiliary_loss_clip": 0.01496141, + "auxiliary_loss_mlp": 0.01287426, + "balance_loss_clip": 1.15491557, + "balance_loss_mlp": 1.03718138, + "epoch": 0.3080715466706749, + "flos": 24062471641440.0, + "grad_norm": 1.9268311445144746, + "language_loss": 0.83518606, + "learning_rate": 3.242232481045813e-06, + "loss": 0.86302173, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.847368001937866 + }, + { + "auxiliary_loss_clip": 0.01495014, + "auxiliary_loss_mlp": 0.01284168, + "balance_loss_clip": 1.15170932, + "balance_loss_mlp": 1.03411448, + "epoch": 0.30813166992334284, + "flos": 25851067734240.0, + "grad_norm": 2.916401179456486, + "language_loss": 0.79264998, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.82044178, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.7812728881835938 + }, + { + "auxiliary_loss_clip": 0.01500692, + "auxiliary_loss_mlp": 0.01295007, + "balance_loss_clip": 1.15830421, + "balance_loss_mlp": 1.04438138, + "epoch": 0.3081917931760108, + "flos": 20451750598080.0, + "grad_norm": 2.1769228592757224, + "language_loss": 0.64552253, + "learning_rate": 3.241621930235989e-06, + "loss": 0.67347956, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 4.332259654998779 + }, + { + "auxiliary_loss_clip": 0.0150163, + "auxiliary_loss_mlp": 0.01280069, + "balance_loss_clip": 1.15828073, + "balance_loss_mlp": 1.03459322, + "epoch": 0.3082519164286788, + "flos": 22168585883520.0, + "grad_norm": 3.101570532076362, + "language_loss": 0.86927021, + "learning_rate": 3.241316584201646e-06, + "loss": 0.89708716, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 4.395834922790527 + }, + { + "auxiliary_loss_clip": 0.01501095, + "auxiliary_loss_mlp": 0.01283373, + "balance_loss_clip": 1.159024, + "balance_loss_mlp": 1.03370094, + "epoch": 0.30831203968134674, + "flos": 28915959162240.0, + "grad_norm": 1.820708923089103, + "language_loss": 0.68589765, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.71374232, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 2.8971519470214844 + }, + { + "auxiliary_loss_clip": 0.01503988, + "auxiliary_loss_mlp": 0.0128871, + "balance_loss_clip": 1.16089427, + "balance_loss_mlp": 1.03598666, + "epoch": 0.3083721629340147, + "flos": 25670186517600.0, + "grad_norm": 2.084906437685133, + "language_loss": 0.71587729, + "learning_rate": 3.240705750931993e-06, + "loss": 0.74380428, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.8331141471862793 + }, + { + "auxiliary_loss_clip": 0.01634647, + "auxiliary_loss_mlp": 0.01232948, + "balance_loss_clip": 1.29625535, + "balance_loss_mlp": 1.01093292, + "epoch": 0.3084322861866827, + "flos": 68219710395840.0, + "grad_norm": 0.842572294421549, + "language_loss": 0.59151059, + "learning_rate": 3.240400263719846e-06, + "loss": 0.62018657, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 3.3085899353027344 + }, + { + "auxiliary_loss_clip": 0.01506188, + "auxiliary_loss_mlp": 0.01288787, + "balance_loss_clip": 1.16414809, + "balance_loss_mlp": 1.03339314, + "epoch": 0.3084924094393507, + "flos": 20298215949120.0, + "grad_norm": 2.8390795945379814, + "language_loss": 0.72775233, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75570208, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.8293354511260986 + }, + { + "auxiliary_loss_clip": 0.01503309, + "auxiliary_loss_mlp": 0.01281036, + "balance_loss_clip": 1.16278696, + "balance_loss_mlp": 1.03022003, + "epoch": 0.30855253269201866, + "flos": 23952023746560.0, + "grad_norm": 1.596978729081421, + "language_loss": 0.71422333, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.74206674, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 2.820760726928711 + }, + { + "auxiliary_loss_clip": 0.01507163, + "auxiliary_loss_mlp": 0.01291516, + "balance_loss_clip": 1.16585875, + "balance_loss_mlp": 1.04775703, + "epoch": 0.3086126559446866, + "flos": 19283930893440.0, + "grad_norm": 3.0440114179089046, + "language_loss": 0.90594757, + "learning_rate": 3.239483519913136e-06, + "loss": 0.93393433, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.802419662475586 + }, + { + "auxiliary_loss_clip": 0.01504385, + "auxiliary_loss_mlp": 0.01287589, + "balance_loss_clip": 1.16218603, + "balance_loss_mlp": 1.03219533, + "epoch": 0.3086727791973546, + "flos": 33763302321120.0, + "grad_norm": 7.493817044597271, + "language_loss": 0.67166626, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69958603, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 2.8814144134521484 + }, + { + "auxiliary_loss_clip": 0.01503601, + "auxiliary_loss_mlp": 0.01279734, + "balance_loss_clip": 1.1619848, + "balance_loss_mlp": 1.02720118, + "epoch": 0.30873290245002255, + "flos": 16035996343680.0, + "grad_norm": 2.4833322586255906, + "language_loss": 0.83076906, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.8586024, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 2.8278133869171143 + }, + { + "auxiliary_loss_clip": 0.01640304, + "auxiliary_loss_mlp": 0.01239357, + "balance_loss_clip": 1.30114889, + "balance_loss_mlp": 1.02115631, + "epoch": 0.3087930257026905, + "flos": 65055711592800.0, + "grad_norm": 0.7075299734174344, + "language_loss": 0.55289721, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.58169389, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.37758469581604 + }, + { + "auxiliary_loss_clip": 0.01507707, + "auxiliary_loss_mlp": 0.01300189, + "balance_loss_clip": 1.16614795, + "balance_loss_mlp": 1.05547643, + "epoch": 0.3088531489553585, + "flos": 74743912559520.0, + "grad_norm": 2.064709356583678, + "language_loss": 0.76017445, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78825343, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 3.196732759475708 + }, + { + "auxiliary_loss_clip": 0.01501021, + "auxiliary_loss_mlp": 0.01286841, + "balance_loss_clip": 1.15908968, + "balance_loss_mlp": 1.04231882, + "epoch": 0.30891327220802645, + "flos": 21144439506240.0, + "grad_norm": 3.863023595835697, + "language_loss": 0.79878473, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82666337, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.8257315158843994 + }, + { + "auxiliary_loss_clip": 0.0150394, + "auxiliary_loss_mlp": 0.01293447, + "balance_loss_clip": 1.16253901, + "balance_loss_mlp": 1.04472852, + "epoch": 0.3089733954606944, + "flos": 25666772983200.0, + "grad_norm": 1.8083582931781346, + "language_loss": 0.81448066, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.84245455, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 2.8221116065979004 + }, + { + "auxiliary_loss_clip": 0.01501342, + "auxiliary_loss_mlp": 0.01301444, + "balance_loss_clip": 1.15914476, + "balance_loss_mlp": 1.05310702, + "epoch": 0.3090335187133624, + "flos": 19429386844320.0, + "grad_norm": 2.1027647715332023, + "language_loss": 0.77794111, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.805969, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 2.783423662185669 + }, + { + "auxiliary_loss_clip": 0.01514285, + "auxiliary_loss_mlp": 0.01311484, + "balance_loss_clip": 1.17264032, + "balance_loss_mlp": 1.07039487, + "epoch": 0.30909364196603034, + "flos": 20013410481120.0, + "grad_norm": 2.229415320135673, + "language_loss": 0.7859242, + "learning_rate": 3.237036802553252e-06, + "loss": 0.8141818, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.7954261302948 + }, + { + "auxiliary_loss_clip": 0.01505457, + "auxiliary_loss_mlp": 0.01316483, + "balance_loss_clip": 1.16328692, + "balance_loss_mlp": 1.07177007, + "epoch": 0.3091537652186983, + "flos": 19679335968960.0, + "grad_norm": 2.880126290090539, + "language_loss": 0.87643743, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.90465689, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.8566343784332275 + }, + { + "auxiliary_loss_clip": 0.01506783, + "auxiliary_loss_mlp": 0.0130449, + "balance_loss_clip": 1.16614246, + "balance_loss_mlp": 1.0574882, + "epoch": 0.3092138884713663, + "flos": 17021986627680.0, + "grad_norm": 2.352131266343081, + "language_loss": 0.78992224, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81803501, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 2.791632890701294 + }, + { + "auxiliary_loss_clip": 0.01510421, + "auxiliary_loss_mlp": 0.01288174, + "balance_loss_clip": 1.16988778, + "balance_loss_mlp": 1.04269862, + "epoch": 0.3092740117240343, + "flos": 25012353952800.0, + "grad_norm": 4.538528294687867, + "language_loss": 0.72167641, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74966234, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 2.791691541671753 + }, + { + "auxiliary_loss_clip": 0.0150513, + "auxiliary_loss_mlp": 0.01296671, + "balance_loss_clip": 1.16356611, + "balance_loss_mlp": 1.04966927, + "epoch": 0.30933413497670226, + "flos": 25592622701760.0, + "grad_norm": 2.008151347984697, + "language_loss": 0.74478436, + "learning_rate": 3.235812317696702e-06, + "loss": 0.77280241, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 2.8365478515625 + }, + { + "auxiliary_loss_clip": 0.01501056, + "auxiliary_loss_mlp": 0.01295105, + "balance_loss_clip": 1.15955055, + "balance_loss_mlp": 1.04962969, + "epoch": 0.3093942582293702, + "flos": 24391994774400.0, + "grad_norm": 4.213028261110326, + "language_loss": 0.76524174, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.79320335, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 2.767270565032959 + }, + { + "auxiliary_loss_clip": 0.01501199, + "auxiliary_loss_mlp": 0.01286663, + "balance_loss_clip": 1.16015136, + "balance_loss_mlp": 1.04042482, + "epoch": 0.3094543814820382, + "flos": 19648500010560.0, + "grad_norm": 2.655899296656221, + "language_loss": 0.669397, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.69727564, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.78896164894104 + }, + { + "auxiliary_loss_clip": 0.01510997, + "auxiliary_loss_mlp": 0.01297236, + "balance_loss_clip": 1.16982937, + "balance_loss_mlp": 1.04794586, + "epoch": 0.30951450473470615, + "flos": 25666052348160.0, + "grad_norm": 2.938857884857106, + "language_loss": 0.74748355, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77556586, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.800853729248047 + }, + { + "auxiliary_loss_clip": 0.01507821, + "auxiliary_loss_mlp": 0.01299617, + "balance_loss_clip": 1.16696215, + "balance_loss_mlp": 1.04689336, + "epoch": 0.3095746279873741, + "flos": 12022019095680.0, + "grad_norm": 2.7553743498584766, + "language_loss": 0.72874057, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.75681496, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.796473264694214 + }, + { + "auxiliary_loss_clip": 0.01505585, + "auxiliary_loss_mlp": 0.01292926, + "balance_loss_clip": 1.16494262, + "balance_loss_mlp": 1.04344523, + "epoch": 0.3096347512400421, + "flos": 23625307297440.0, + "grad_norm": 1.8495912828185297, + "language_loss": 0.84982294, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87780809, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 2.786074638366699 + }, + { + "auxiliary_loss_clip": 0.01503676, + "auxiliary_loss_mlp": 0.01276302, + "balance_loss_clip": 1.16522789, + "balance_loss_mlp": 1.02758384, + "epoch": 0.30969487449271005, + "flos": 22531562017920.0, + "grad_norm": 1.9797562383492684, + "language_loss": 0.79070824, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81850803, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 2.7933828830718994 + }, + { + "auxiliary_loss_clip": 0.01502085, + "auxiliary_loss_mlp": 0.01294226, + "balance_loss_clip": 1.16142857, + "balance_loss_mlp": 1.04493594, + "epoch": 0.309754997745378, + "flos": 15269726076480.0, + "grad_norm": 2.277276797711849, + "language_loss": 0.67148054, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69944358, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 2.789032459259033 + }, + { + "auxiliary_loss_clip": 0.01504111, + "auxiliary_loss_mlp": 0.01290684, + "balance_loss_clip": 1.16411245, + "balance_loss_mlp": 1.04291952, + "epoch": 0.309815120998046, + "flos": 26981717477760.0, + "grad_norm": 2.179914615878273, + "language_loss": 0.82832587, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.85627389, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.8829543590545654 + }, + { + "auxiliary_loss_clip": 0.01505642, + "auxiliary_loss_mlp": 0.01282714, + "balance_loss_clip": 1.16781783, + "balance_loss_mlp": 1.03361392, + "epoch": 0.30987524425071394, + "flos": 21145349782080.0, + "grad_norm": 2.23860509458605, + "language_loss": 0.7390368, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76692033, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 2.7885117530822754 + }, + { + "auxiliary_loss_clip": 0.01512178, + "auxiliary_loss_mlp": 0.01283486, + "balance_loss_clip": 1.17298532, + "balance_loss_mlp": 1.03305125, + "epoch": 0.3099353675033819, + "flos": 15270105358080.0, + "grad_norm": 2.0636379845857875, + "language_loss": 0.7645694, + "learning_rate": 3.232747826832858e-06, + "loss": 0.79252601, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 2.7215754985809326 + }, + { + "auxiliary_loss_clip": 0.01502458, + "auxiliary_loss_mlp": 0.01288147, + "balance_loss_clip": 1.16373563, + "balance_loss_mlp": 1.03790283, + "epoch": 0.30999549075604993, + "flos": 15415599237120.0, + "grad_norm": 1.9256378541154033, + "language_loss": 0.78905261, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81695861, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 4.611695051193237 + }, + { + "auxiliary_loss_clip": 0.01505893, + "auxiliary_loss_mlp": 0.01294298, + "balance_loss_clip": 1.16704142, + "balance_loss_mlp": 1.04367256, + "epoch": 0.3100556140087179, + "flos": 23186739611520.0, + "grad_norm": 2.4686057835722446, + "language_loss": 0.74625719, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77425915, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.9106194972991943 + }, + { + "auxiliary_loss_clip": 0.01506178, + "auxiliary_loss_mlp": 0.01289722, + "balance_loss_clip": 1.16870606, + "balance_loss_mlp": 1.04043162, + "epoch": 0.31011573726138586, + "flos": 25744716080640.0, + "grad_norm": 1.6443773498644647, + "language_loss": 0.69202781, + "learning_rate": 3.231827567499327e-06, + "loss": 0.7199868, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.9362568855285645 + }, + { + "auxiliary_loss_clip": 0.01513074, + "auxiliary_loss_mlp": 0.01282431, + "balance_loss_clip": 1.17638159, + "balance_loss_mlp": 1.03542948, + "epoch": 0.3101758605140538, + "flos": 20013410481120.0, + "grad_norm": 2.1357692506340142, + "language_loss": 0.84934491, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.8772999, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 2.8980770111083984 + }, + { + "auxiliary_loss_clip": 0.01506684, + "auxiliary_loss_mlp": 0.01280764, + "balance_loss_clip": 1.17075849, + "balance_loss_mlp": 1.0307107, + "epoch": 0.3102359837667218, + "flos": 19137716379360.0, + "grad_norm": 2.4764428381047003, + "language_loss": 0.84981549, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87769002, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 2.8352344036102295 + }, + { + "auxiliary_loss_clip": 0.01516223, + "auxiliary_loss_mlp": 0.01276065, + "balance_loss_clip": 1.17968953, + "balance_loss_mlp": 1.02658415, + "epoch": 0.31029610701938976, + "flos": 22267162264320.0, + "grad_norm": 2.174535014442364, + "language_loss": 0.75184572, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77976865, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.851994514465332 + }, + { + "auxiliary_loss_clip": 0.01507779, + "auxiliary_loss_mlp": 0.01289942, + "balance_loss_clip": 1.17107165, + "balance_loss_mlp": 1.03664589, + "epoch": 0.3103562302720577, + "flos": 20806989387840.0, + "grad_norm": 2.2839910521667917, + "language_loss": 0.82101381, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.84899104, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.8669626712799072 + }, + { + "auxiliary_loss_clip": 0.01507813, + "auxiliary_loss_mlp": 0.01280643, + "balance_loss_clip": 1.17141533, + "balance_loss_mlp": 1.03230596, + "epoch": 0.3104163535247257, + "flos": 22346243206560.0, + "grad_norm": 1.958976785972039, + "language_loss": 0.82791483, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85579944, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.901947498321533 + }, + { + "auxiliary_loss_clip": 0.01504542, + "auxiliary_loss_mlp": 0.01296101, + "balance_loss_clip": 1.16844904, + "balance_loss_mlp": 1.04642868, + "epoch": 0.31047647677739365, + "flos": 21691520750880.0, + "grad_norm": 1.9076257973756663, + "language_loss": 0.76103795, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78904432, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 4.363858461380005 + }, + { + "auxiliary_loss_clip": 0.01512107, + "auxiliary_loss_mlp": 0.01294194, + "balance_loss_clip": 1.17685556, + "balance_loss_mlp": 1.04566693, + "epoch": 0.3105366000300616, + "flos": 18919323848160.0, + "grad_norm": 2.37286568140833, + "language_loss": 0.74645931, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.7745223, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 4.944403171539307 + }, + { + "auxiliary_loss_clip": 0.01511689, + "auxiliary_loss_mlp": 0.01288039, + "balance_loss_clip": 1.17663157, + "balance_loss_mlp": 1.03951144, + "epoch": 0.3105967232827296, + "flos": 18262401559200.0, + "grad_norm": 1.5094798464207104, + "language_loss": 0.75808132, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78607869, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 4.25383996963501 + }, + { + "auxiliary_loss_clip": 0.01517771, + "auxiliary_loss_mlp": 0.01302345, + "balance_loss_clip": 1.18200552, + "balance_loss_mlp": 1.05534387, + "epoch": 0.31065684653539755, + "flos": 17673864471360.0, + "grad_norm": 2.9952998542392972, + "language_loss": 0.73433596, + "learning_rate": 3.229064268360444e-06, + "loss": 0.76253712, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 2.780322313308716 + }, + { + "auxiliary_loss_clip": 0.01600812, + "auxiliary_loss_mlp": 0.01335617, + "balance_loss_clip": 1.26992929, + "balance_loss_mlp": 1.12199402, + "epoch": 0.3107169697880655, + "flos": 68538537787680.0, + "grad_norm": 0.7419778767950135, + "language_loss": 0.52983451, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55919886, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 3.3954555988311768 + }, + { + "auxiliary_loss_clip": 0.01502343, + "auxiliary_loss_mlp": 0.01294073, + "balance_loss_clip": 1.16613269, + "balance_loss_mlp": 1.04611778, + "epoch": 0.3107770930407335, + "flos": 13190445650880.0, + "grad_norm": 2.9243930390633155, + "language_loss": 0.78766859, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.8156327, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.8632845878601074 + }, + { + "auxiliary_loss_clip": 0.01503709, + "auxiliary_loss_mlp": 0.01288541, + "balance_loss_clip": 1.16695881, + "balance_loss_mlp": 1.03906024, + "epoch": 0.3108372162934015, + "flos": 31583776675680.0, + "grad_norm": 2.244989856457952, + "language_loss": 0.64038062, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66830313, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.872701406478882 + }, + { + "auxiliary_loss_clip": 0.0151497, + "auxiliary_loss_mlp": 0.0129483, + "balance_loss_clip": 1.17900503, + "balance_loss_mlp": 1.04630208, + "epoch": 0.31089733954606946, + "flos": 28732309189920.0, + "grad_norm": 2.688128177005215, + "language_loss": 0.78012443, + "learning_rate": 3.22783492314295e-06, + "loss": 0.80822241, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.851208448410034 + }, + { + "auxiliary_loss_clip": 0.01509927, + "auxiliary_loss_mlp": 0.01304644, + "balance_loss_clip": 1.17344999, + "balance_loss_mlp": 1.05630732, + "epoch": 0.3109574627987374, + "flos": 19685328618240.0, + "grad_norm": 1.844028163911526, + "language_loss": 0.84126705, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86941272, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 2.9347596168518066 + }, + { + "auxiliary_loss_clip": 0.0149942, + "auxiliary_loss_mlp": 0.01294837, + "balance_loss_clip": 1.16294599, + "balance_loss_mlp": 1.04230368, + "epoch": 0.3110175860514054, + "flos": 14685740367840.0, + "grad_norm": 4.729580868676706, + "language_loss": 0.84717655, + "learning_rate": 3.227219971129842e-06, + "loss": 0.87511915, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.8117032051086426 + }, + { + "auxiliary_loss_clip": 0.01512793, + "auxiliary_loss_mlp": 0.01290299, + "balance_loss_clip": 1.17613137, + "balance_loss_mlp": 1.04653978, + "epoch": 0.31107770930407336, + "flos": 25742061109440.0, + "grad_norm": 1.7356391679122334, + "language_loss": 0.83595735, + "learning_rate": 3.226912425313001e-06, + "loss": 0.86398828, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.8337790966033936 + }, + { + "auxiliary_loss_clip": 0.01506595, + "auxiliary_loss_mlp": 0.01287283, + "balance_loss_clip": 1.17135715, + "balance_loss_mlp": 1.03818321, + "epoch": 0.3111378325567413, + "flos": 19210273678080.0, + "grad_norm": 2.4528148084000625, + "language_loss": 0.85111201, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87905073, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 2.7788796424865723 + }, + { + "auxiliary_loss_clip": 0.01504461, + "auxiliary_loss_mlp": 0.01303196, + "balance_loss_clip": 1.16911411, + "balance_loss_mlp": 1.05714798, + "epoch": 0.3111979558094093, + "flos": 23698661087520.0, + "grad_norm": 1.7112423628313598, + "language_loss": 0.83345151, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.86152816, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 2.80869722366333 + }, + { + "auxiliary_loss_clip": 0.01503831, + "auxiliary_loss_mlp": 0.01287908, + "balance_loss_clip": 1.16517353, + "balance_loss_mlp": 1.0426228, + "epoch": 0.31125807906207725, + "flos": 21035167384320.0, + "grad_norm": 4.3833938825657865, + "language_loss": 0.80978894, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.83770633, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.8079771995544434 + }, + { + "auxiliary_loss_clip": 0.01506248, + "auxiliary_loss_mlp": 0.01291993, + "balance_loss_clip": 1.17140794, + "balance_loss_mlp": 1.04136777, + "epoch": 0.3113182023147452, + "flos": 23078908759680.0, + "grad_norm": 1.9602014805501058, + "language_loss": 0.80968952, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.83767194, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 2.9472124576568604 + }, + { + "auxiliary_loss_clip": 0.01502751, + "auxiliary_loss_mlp": 0.01289437, + "balance_loss_clip": 1.16623545, + "balance_loss_mlp": 1.0369041, + "epoch": 0.3113783255674132, + "flos": 11840644812960.0, + "grad_norm": 2.0720334891907353, + "language_loss": 0.81505591, + "learning_rate": 3.225373998592471e-06, + "loss": 0.84297776, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.01511321, + "auxiliary_loss_mlp": 0.01290304, + "balance_loss_clip": 1.17532253, + "balance_loss_mlp": 1.04063225, + "epoch": 0.31143844882008115, + "flos": 16291407123360.0, + "grad_norm": 1.6530104805678791, + "language_loss": 0.78229105, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.81030726, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 2.798112154006958 + }, + { + "auxiliary_loss_clip": 0.01502392, + "auxiliary_loss_mlp": 0.01273219, + "balance_loss_clip": 1.1656673, + "balance_loss_mlp": 1.02450037, + "epoch": 0.3114985720727491, + "flos": 23219813331360.0, + "grad_norm": 2.2739513525757604, + "language_loss": 0.8316263, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85938245, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.881450653076172 + }, + { + "auxiliary_loss_clip": 0.01509473, + "auxiliary_loss_mlp": 0.01275223, + "balance_loss_clip": 1.17350936, + "balance_loss_mlp": 1.02230883, + "epoch": 0.3115586953254171, + "flos": 30047253684480.0, + "grad_norm": 1.7749800887068716, + "language_loss": 0.74577951, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.77362645, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 2.825983762741089 + }, + { + "auxiliary_loss_clip": 0.01504497, + "auxiliary_loss_mlp": 0.01296388, + "balance_loss_clip": 1.16738272, + "balance_loss_mlp": 1.04309237, + "epoch": 0.3116188185780851, + "flos": 25668707319360.0, + "grad_norm": 2.365122964967555, + "language_loss": 0.710495, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.73850381, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 2.846652030944824 + }, + { + "auxiliary_loss_clip": 0.01599174, + "auxiliary_loss_mlp": 0.01242615, + "balance_loss_clip": 1.26649022, + "balance_loss_mlp": 1.01144409, + "epoch": 0.31167894183075306, + "flos": 69516259732800.0, + "grad_norm": 0.9642632617348111, + "language_loss": 0.59523493, + "learning_rate": 3.223834410214408e-06, + "loss": 0.62365282, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 3.336977243423462 + }, + { + "auxiliary_loss_clip": 0.01505363, + "auxiliary_loss_mlp": 0.01279613, + "balance_loss_clip": 1.16792655, + "balance_loss_mlp": 1.03013194, + "epoch": 0.31173906508342103, + "flos": 14941644213600.0, + "grad_norm": 6.878899800577155, + "language_loss": 0.69773865, + "learning_rate": 3.223526353268311e-06, + "loss": 0.72558844, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.806776285171509 + }, + { + "auxiliary_loss_clip": 0.01503177, + "auxiliary_loss_mlp": 0.01297491, + "balance_loss_clip": 1.1650368, + "balance_loss_mlp": 1.04686546, + "epoch": 0.311799188336089, + "flos": 16177659478560.0, + "grad_norm": 3.0475030723582814, + "language_loss": 0.63890105, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66690767, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 2.838395833969116 + }, + { + "auxiliary_loss_clip": 0.01510193, + "auxiliary_loss_mlp": 0.01301505, + "balance_loss_clip": 1.17071843, + "balance_loss_mlp": 1.04801869, + "epoch": 0.31185931158875696, + "flos": 25011974671200.0, + "grad_norm": 2.1254660088585444, + "language_loss": 0.85800588, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88612288, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.8227169513702393 + }, + { + "auxiliary_loss_clip": 0.01501232, + "auxiliary_loss_mlp": 0.01308526, + "balance_loss_clip": 1.16488767, + "balance_loss_mlp": 1.06018901, + "epoch": 0.3119194348414249, + "flos": 37235963404800.0, + "grad_norm": 1.5007319906279855, + "language_loss": 0.62938422, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.65748185, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.936718463897705 + }, + { + "auxiliary_loss_clip": 0.01505237, + "auxiliary_loss_mlp": 0.01300731, + "balance_loss_clip": 1.16727686, + "balance_loss_mlp": 1.05048716, + "epoch": 0.3119795580940929, + "flos": 15014580793920.0, + "grad_norm": 2.7740089800169625, + "language_loss": 0.83383369, + "learning_rate": 3.222293661638346e-06, + "loss": 0.8618933, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 2.7831830978393555 + }, + { + "auxiliary_loss_clip": 0.01507791, + "auxiliary_loss_mlp": 0.01301214, + "balance_loss_clip": 1.16962302, + "balance_loss_mlp": 1.05535698, + "epoch": 0.31203968134676086, + "flos": 16000040083680.0, + "grad_norm": 1.8798799552223715, + "language_loss": 0.79613078, + "learning_rate": 3.22198537282789e-06, + "loss": 0.8242209, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 2.7826364040374756 + }, + { + "auxiliary_loss_clip": 0.01503237, + "auxiliary_loss_mlp": 0.01303068, + "balance_loss_clip": 1.16549933, + "balance_loss_mlp": 1.05244255, + "epoch": 0.3120998045994288, + "flos": 23839376018400.0, + "grad_norm": 1.573214445750134, + "language_loss": 0.75203323, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.78009629, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 2.76686429977417 + }, + { + "auxiliary_loss_clip": 0.01592516, + "auxiliary_loss_mlp": 0.01254158, + "balance_loss_clip": 1.25715137, + "balance_loss_mlp": 1.03366852, + "epoch": 0.3121599278520968, + "flos": 69190529415840.0, + "grad_norm": 0.8415654408628415, + "language_loss": 0.63839519, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66686189, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 3.4123637676239014 + }, + { + "auxiliary_loss_clip": 0.01496363, + "auxiliary_loss_mlp": 0.0130486, + "balance_loss_clip": 1.15746737, + "balance_loss_mlp": 1.05156446, + "epoch": 0.31222005110476475, + "flos": 23808691772640.0, + "grad_norm": 3.4402706233806346, + "language_loss": 0.80219901, + "learning_rate": 3.221060228416446e-06, + "loss": 0.83021116, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 2.8092873096466064 + }, + { + "auxiliary_loss_clip": 0.01493988, + "auxiliary_loss_mlp": 0.01301209, + "balance_loss_clip": 1.15525007, + "balance_loss_mlp": 1.05420721, + "epoch": 0.3122801743574327, + "flos": 25228357009920.0, + "grad_norm": 2.5210320161570703, + "language_loss": 0.72762221, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.75557423, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 4.60297703742981 + }, + { + "auxiliary_loss_clip": 0.01499153, + "auxiliary_loss_mlp": 0.01299338, + "balance_loss_clip": 1.16122341, + "balance_loss_mlp": 1.05290842, + "epoch": 0.3123402976101007, + "flos": 22968726361920.0, + "grad_norm": 2.5017190399150127, + "language_loss": 0.76462233, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.79260725, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 2.83834171295166 + }, + { + "auxiliary_loss_clip": 0.01486263, + "auxiliary_loss_mlp": 0.01301931, + "balance_loss_clip": 1.14845097, + "balance_loss_mlp": 1.05359423, + "epoch": 0.3124004208627687, + "flos": 25194448870560.0, + "grad_norm": 1.9232175554375055, + "language_loss": 0.77937764, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80725956, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.8652291297912598 + }, + { + "auxiliary_loss_clip": 0.0158698, + "auxiliary_loss_mlp": 0.01254898, + "balance_loss_clip": 1.25173879, + "balance_loss_mlp": 1.03364563, + "epoch": 0.31246054411543667, + "flos": 67492278928800.0, + "grad_norm": 0.769178499994538, + "language_loss": 0.54634738, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.57476616, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.3487157821655273 + }, + { + "auxiliary_loss_clip": 0.01492271, + "auxiliary_loss_mlp": 0.01281257, + "balance_loss_clip": 1.15574527, + "balance_loss_mlp": 1.02948725, + "epoch": 0.31252066736810463, + "flos": 17860472840160.0, + "grad_norm": 1.8262073853212244, + "language_loss": 0.66660386, + "learning_rate": 3.21951739516552e-06, + "loss": 0.69433916, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.79313588142395 + }, + { + "auxiliary_loss_clip": 0.01493305, + "auxiliary_loss_mlp": 0.01295567, + "balance_loss_clip": 1.15552175, + "balance_loss_mlp": 1.03902888, + "epoch": 0.3125807906207726, + "flos": 18476811633600.0, + "grad_norm": 2.591601290756655, + "language_loss": 0.69167817, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71956688, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 2.837158441543579 + }, + { + "auxiliary_loss_clip": 0.01495888, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 1.15752816, + "balance_loss_mlp": 1.0435524, + "epoch": 0.31264091387344056, + "flos": 18948187542240.0, + "grad_norm": 2.084706981241035, + "language_loss": 0.78564936, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81354427, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 2.776317596435547 + }, + { + "auxiliary_loss_clip": 0.01494405, + "auxiliary_loss_mlp": 0.01297044, + "balance_loss_clip": 1.15620112, + "balance_loss_mlp": 1.04966056, + "epoch": 0.3127010371261085, + "flos": 21470549104800.0, + "grad_norm": 2.3190250349137824, + "language_loss": 0.83739173, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.8653062, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.7654552459716797 + }, + { + "auxiliary_loss_clip": 0.01498025, + "auxiliary_loss_mlp": 0.01283835, + "balance_loss_clip": 1.1569277, + "balance_loss_mlp": 1.03092074, + "epoch": 0.3127611603787765, + "flos": 15337580283360.0, + "grad_norm": 2.436631967168035, + "language_loss": 0.69199622, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71981484, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 4.281016111373901 + }, + { + "auxiliary_loss_clip": 0.01495303, + "auxiliary_loss_mlp": 0.01283438, + "balance_loss_clip": 1.15502632, + "balance_loss_mlp": 1.02728128, + "epoch": 0.31282128363144446, + "flos": 17604910347840.0, + "grad_norm": 2.794065673334413, + "language_loss": 0.84498966, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.8727771, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 4.656061410903931 + }, + { + "auxiliary_loss_clip": 0.01494234, + "auxiliary_loss_mlp": 0.0128704, + "balance_loss_clip": 1.15573514, + "balance_loss_mlp": 1.03412509, + "epoch": 0.3128814068841124, + "flos": 26758963208160.0, + "grad_norm": 2.3880801379051455, + "language_loss": 0.61225498, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.6400677, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 4.329213380813599 + }, + { + "auxiliary_loss_clip": 0.0149382, + "auxiliary_loss_mlp": 0.01282151, + "balance_loss_clip": 1.1545198, + "balance_loss_mlp": 1.03381395, + "epoch": 0.3129415301367804, + "flos": 22274520327360.0, + "grad_norm": 2.05603552772782, + "language_loss": 0.66388655, + "learning_rate": 3.217355486684887e-06, + "loss": 0.69164628, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.8122735023498535 + }, + { + "auxiliary_loss_clip": 0.01497583, + "auxiliary_loss_mlp": 0.01289404, + "balance_loss_clip": 1.15861285, + "balance_loss_mlp": 1.04049492, + "epoch": 0.31300165338944835, + "flos": 26467103102400.0, + "grad_norm": 2.097962220455743, + "language_loss": 0.7643162, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.79218614, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.8390731811523438 + }, + { + "auxiliary_loss_clip": 0.01498048, + "auxiliary_loss_mlp": 0.01281789, + "balance_loss_clip": 1.15813136, + "balance_loss_mlp": 1.03059161, + "epoch": 0.3130617766421163, + "flos": 21946817746080.0, + "grad_norm": 2.918331197077257, + "language_loss": 0.83203447, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85983288, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 2.7662432193756104 + }, + { + "auxiliary_loss_clip": 0.01490855, + "auxiliary_loss_mlp": 0.01285149, + "balance_loss_clip": 1.15116644, + "balance_loss_mlp": 1.04005468, + "epoch": 0.3131218998947843, + "flos": 23294911816800.0, + "grad_norm": 1.8875416756483037, + "language_loss": 0.71773851, + "learning_rate": 3.216428261810999e-06, + "loss": 0.74549854, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.7758915424346924 + }, + { + "auxiliary_loss_clip": 0.01497095, + "auxiliary_loss_mlp": 0.01290797, + "balance_loss_clip": 1.1567775, + "balance_loss_mlp": 1.04265094, + "epoch": 0.3131820231474523, + "flos": 21141632822400.0, + "grad_norm": 1.9581304002814341, + "language_loss": 0.74714565, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.77502453, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.802816152572632 + }, + { + "auxiliary_loss_clip": 0.0149134, + "auxiliary_loss_mlp": 0.01290351, + "balance_loss_clip": 1.15137672, + "balance_loss_mlp": 1.04468417, + "epoch": 0.31324214640012027, + "flos": 23911857460800.0, + "grad_norm": 2.7361879237828366, + "language_loss": 0.77609676, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.80391365, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.788329839706421 + }, + { + "auxiliary_loss_clip": 0.01495694, + "auxiliary_loss_mlp": 0.01285349, + "balance_loss_clip": 1.15480757, + "balance_loss_mlp": 1.04063606, + "epoch": 0.31330226965278823, + "flos": 22239132989760.0, + "grad_norm": 2.666949726056074, + "language_loss": 0.7926271, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.82043749, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 2.7949914932250977 + }, + { + "auxiliary_loss_clip": 0.01489745, + "auxiliary_loss_mlp": 0.01300249, + "balance_loss_clip": 1.14961433, + "balance_loss_mlp": 1.05744398, + "epoch": 0.3133623929054562, + "flos": 19755951580800.0, + "grad_norm": 1.9486972143586554, + "language_loss": 0.79587585, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82377577, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.8224029541015625 + }, + { + "auxiliary_loss_clip": 0.01504243, + "auxiliary_loss_mlp": 0.01301207, + "balance_loss_clip": 1.16329908, + "balance_loss_mlp": 1.05191684, + "epoch": 0.31342251615812416, + "flos": 27164722671360.0, + "grad_norm": 2.7723551266757855, + "language_loss": 0.71452773, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.7425822, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 2.7877137660980225 + }, + { + "auxiliary_loss_clip": 0.0149407, + "auxiliary_loss_mlp": 0.01296503, + "balance_loss_clip": 1.15246296, + "balance_loss_mlp": 1.04664004, + "epoch": 0.31348263941079213, + "flos": 20231878868640.0, + "grad_norm": 2.65962790323188, + "language_loss": 0.77960241, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.80750811, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 2.8231794834136963 + }, + { + "auxiliary_loss_clip": 0.01489341, + "auxiliary_loss_mlp": 0.0128689, + "balance_loss_clip": 1.14886451, + "balance_loss_mlp": 1.04294014, + "epoch": 0.3135427626634601, + "flos": 24609932167680.0, + "grad_norm": 1.7301008893046714, + "language_loss": 0.82687891, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.8546412, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.816507339477539 + }, + { + "auxiliary_loss_clip": 0.01488541, + "auxiliary_loss_mlp": 0.0130358, + "balance_loss_clip": 1.14777899, + "balance_loss_mlp": 1.05753183, + "epoch": 0.31360288591612806, + "flos": 20962003235040.0, + "grad_norm": 2.473488491837801, + "language_loss": 0.79490715, + "learning_rate": 3.213953633415686e-06, + "loss": 0.82282841, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.8137011528015137 + }, + { + "auxiliary_loss_clip": 0.01495203, + "auxiliary_loss_mlp": 0.01300648, + "balance_loss_clip": 1.1542412, + "balance_loss_mlp": 1.05288315, + "epoch": 0.313663009168796, + "flos": 26983007035200.0, + "grad_norm": 2.5184769609512245, + "language_loss": 0.68747973, + "learning_rate": 3.213644097593477e-06, + "loss": 0.71543825, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.8072268962860107 + }, + { + "auxiliary_loss_clip": 0.01502381, + "auxiliary_loss_mlp": 0.01286691, + "balance_loss_clip": 1.16234827, + "balance_loss_mlp": 1.0389266, + "epoch": 0.313723132421464, + "flos": 18042871183200.0, + "grad_norm": 1.85956087478906, + "language_loss": 0.80329084, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83118153, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 2.7730143070220947 + }, + { + "auxiliary_loss_clip": 0.01494963, + "auxiliary_loss_mlp": 0.01294462, + "balance_loss_clip": 1.15434241, + "balance_loss_mlp": 1.04555285, + "epoch": 0.31378325567413196, + "flos": 22490940594240.0, + "grad_norm": 2.783640303757616, + "language_loss": 0.69233549, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.72022974, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 2.851858377456665 + }, + { + "auxiliary_loss_clip": 0.01492865, + "auxiliary_loss_mlp": 0.01295102, + "balance_loss_clip": 1.15261841, + "balance_loss_mlp": 1.04905391, + "epoch": 0.3138433789267999, + "flos": 22421455476480.0, + "grad_norm": 4.753682812190641, + "language_loss": 0.79467624, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.8225559, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 2.7999629974365234 + }, + { + "auxiliary_loss_clip": 0.0148995, + "auxiliary_loss_mlp": 0.01285341, + "balance_loss_clip": 1.14972138, + "balance_loss_mlp": 1.03872073, + "epoch": 0.3139035021794679, + "flos": 13007592169920.0, + "grad_norm": 1.9650909975078499, + "language_loss": 0.73077327, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75852615, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.8930823802948 + }, + { + "auxiliary_loss_clip": 0.01498526, + "auxiliary_loss_mlp": 0.01291857, + "balance_loss_clip": 1.1585685, + "balance_loss_mlp": 1.0461911, + "epoch": 0.31396362543213585, + "flos": 16947874274400.0, + "grad_norm": 2.976659160362279, + "language_loss": 0.81691617, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84482002, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 2.7772679328918457 + }, + { + "auxiliary_loss_clip": 0.01492375, + "auxiliary_loss_mlp": 0.01293742, + "balance_loss_clip": 1.15092385, + "balance_loss_mlp": 1.04883885, + "epoch": 0.31402374868480387, + "flos": 20158790575680.0, + "grad_norm": 2.312994563371127, + "language_loss": 0.70039314, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72825432, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.785116195678711 + }, + { + "auxiliary_loss_clip": 0.01491532, + "auxiliary_loss_mlp": 0.01289105, + "balance_loss_clip": 1.14992225, + "balance_loss_mlp": 1.0470624, + "epoch": 0.31408387193747184, + "flos": 21253180633920.0, + "grad_norm": 3.1448502280855113, + "language_loss": 0.80374181, + "learning_rate": 3.211476058893379e-06, + "loss": 0.83154821, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 2.8176279067993164 + }, + { + "auxiliary_loss_clip": 0.01485303, + "auxiliary_loss_mlp": 0.01307608, + "balance_loss_clip": 1.14429617, + "balance_loss_mlp": 1.06079674, + "epoch": 0.3141439951901398, + "flos": 27486394675200.0, + "grad_norm": 3.7742461997014565, + "language_loss": 0.57818961, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.6061188, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 2.982513189315796 + }, + { + "auxiliary_loss_clip": 0.01482878, + "auxiliary_loss_mlp": 0.01283982, + "balance_loss_clip": 1.14238012, + "balance_loss_mlp": 1.03831601, + "epoch": 0.31420411844280777, + "flos": 17853835412160.0, + "grad_norm": 2.044774601917731, + "language_loss": 0.8186022, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.8462708, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 2.8161122798919678 + }, + { + "auxiliary_loss_clip": 0.0149115, + "auxiliary_loss_mlp": 0.01301428, + "balance_loss_clip": 1.14926887, + "balance_loss_mlp": 1.05423594, + "epoch": 0.31426424169547573, + "flos": 21619153092960.0, + "grad_norm": 2.2223414982909944, + "language_loss": 0.73939353, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76731932, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 2.784285068511963 + }, + { + "auxiliary_loss_clip": 0.01491729, + "auxiliary_loss_mlp": 0.01306242, + "balance_loss_clip": 1.1506846, + "balance_loss_mlp": 1.05866778, + "epoch": 0.3143243649481437, + "flos": 30923327067840.0, + "grad_norm": 2.358082558214934, + "language_loss": 0.67971706, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.7076968, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 2.8706772327423096 + }, + { + "auxiliary_loss_clip": 0.0148719, + "auxiliary_loss_mlp": 0.01300904, + "balance_loss_clip": 1.14444804, + "balance_loss_mlp": 1.05580962, + "epoch": 0.31438448820081166, + "flos": 22823687620800.0, + "grad_norm": 1.9338584490015402, + "language_loss": 0.79465169, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82253265, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 2.775669574737549 + }, + { + "auxiliary_loss_clip": 0.01485342, + "auxiliary_loss_mlp": 0.01296505, + "balance_loss_clip": 1.14427328, + "balance_loss_mlp": 1.050457, + "epoch": 0.3144446114534796, + "flos": 23294229109920.0, + "grad_norm": 2.0817731920497837, + "language_loss": 0.69800687, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72582531, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 2.8252334594726562 + }, + { + "auxiliary_loss_clip": 0.01480931, + "auxiliary_loss_mlp": 0.01285186, + "balance_loss_clip": 1.13935804, + "balance_loss_mlp": 1.03684878, + "epoch": 0.3145047347061476, + "flos": 31358557075680.0, + "grad_norm": 1.847786120546986, + "language_loss": 0.79788506, + "learning_rate": 3.209305769168239e-06, + "loss": 0.82554621, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 2.7884817123413086 + }, + { + "auxiliary_loss_clip": 0.01482088, + "auxiliary_loss_mlp": 0.01285646, + "balance_loss_clip": 1.13924527, + "balance_loss_mlp": 1.03826296, + "epoch": 0.31456485795881556, + "flos": 10891103855040.0, + "grad_norm": 1.9518589051542894, + "language_loss": 0.84501404, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87269139, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.7739815711975098 + }, + { + "auxiliary_loss_clip": 0.01481383, + "auxiliary_loss_mlp": 0.01290186, + "balance_loss_clip": 1.13885188, + "balance_loss_mlp": 1.04242134, + "epoch": 0.3146249812114835, + "flos": 17094354285600.0, + "grad_norm": 1.8679975082043816, + "language_loss": 0.80105817, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82877392, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 4.447204351425171 + }, + { + "auxiliary_loss_clip": 0.01490012, + "auxiliary_loss_mlp": 0.01285048, + "balance_loss_clip": 1.14673257, + "balance_loss_mlp": 1.03652084, + "epoch": 0.3146851044641515, + "flos": 55295409558240.0, + "grad_norm": 2.0734015781506616, + "language_loss": 0.70746964, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.73522031, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 3.0500569343566895 + }, + { + "auxiliary_loss_clip": 0.01477504, + "auxiliary_loss_mlp": 0.01281121, + "balance_loss_clip": 1.1356355, + "balance_loss_mlp": 1.0324024, + "epoch": 0.31474522771681945, + "flos": 27018318516480.0, + "grad_norm": 1.9149535199192016, + "language_loss": 0.71924758, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74683386, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 2.7792551517486572 + }, + { + "auxiliary_loss_clip": 0.01473109, + "auxiliary_loss_mlp": 0.01297358, + "balance_loss_clip": 1.12986636, + "balance_loss_mlp": 1.04825783, + "epoch": 0.3148053509694875, + "flos": 21254090909760.0, + "grad_norm": 2.149468963011226, + "language_loss": 0.78678054, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81448519, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.7813594341278076 + }, + { + "auxiliary_loss_clip": 0.01477616, + "auxiliary_loss_mlp": 0.01295488, + "balance_loss_clip": 1.13432348, + "balance_loss_mlp": 1.04581571, + "epoch": 0.31486547422215544, + "flos": 31251371002560.0, + "grad_norm": 1.889957090273392, + "language_loss": 0.75788373, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78561473, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.884915828704834 + }, + { + "auxiliary_loss_clip": 0.01480053, + "auxiliary_loss_mlp": 0.01288628, + "balance_loss_clip": 1.13930559, + "balance_loss_mlp": 1.04524994, + "epoch": 0.3149255974748234, + "flos": 19830670784640.0, + "grad_norm": 2.3930267286427944, + "language_loss": 0.79503328, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82272005, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 2.7848713397979736 + }, + { + "auxiliary_loss_clip": 0.01591331, + "auxiliary_loss_mlp": 0.01245796, + "balance_loss_clip": 1.24955201, + "balance_loss_mlp": 1.02988434, + "epoch": 0.31498572072749137, + "flos": 67689962684640.0, + "grad_norm": 0.8297683618770342, + "language_loss": 0.67910779, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.70747906, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 3.290994167327881 + }, + { + "auxiliary_loss_clip": 0.01477757, + "auxiliary_loss_mlp": 0.01293609, + "balance_loss_clip": 1.13402617, + "balance_loss_mlp": 1.04508173, + "epoch": 0.31504584398015933, + "flos": 19795245518880.0, + "grad_norm": 3.9465958627862885, + "language_loss": 0.82576263, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.85347629, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 4.249819278717041 + }, + { + "auxiliary_loss_clip": 0.01484447, + "auxiliary_loss_mlp": 0.013098, + "balance_loss_clip": 1.14248621, + "balance_loss_mlp": 1.06413305, + "epoch": 0.3151059672328273, + "flos": 26617868995680.0, + "grad_norm": 1.9836411810528343, + "language_loss": 0.81170881, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83965123, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 5.998563289642334 + }, + { + "auxiliary_loss_clip": 0.01478966, + "auxiliary_loss_mlp": 0.01286358, + "balance_loss_clip": 1.1363318, + "balance_loss_mlp": 1.0397377, + "epoch": 0.31516609048549526, + "flos": 24206600106720.0, + "grad_norm": 2.379989867183029, + "language_loss": 0.74702382, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.7746771, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.7989556789398193 + }, + { + "auxiliary_loss_clip": 0.01479603, + "auxiliary_loss_mlp": 0.01286042, + "balance_loss_clip": 1.13873768, + "balance_loss_mlp": 1.0363698, + "epoch": 0.31522621373816323, + "flos": 25961136347520.0, + "grad_norm": 1.9700645861796413, + "language_loss": 0.73628157, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.76393807, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.766000747680664 + }, + { + "auxiliary_loss_clip": 0.01478763, + "auxiliary_loss_mlp": 0.01296653, + "balance_loss_clip": 1.13575232, + "balance_loss_mlp": 1.05079603, + "epoch": 0.3152863369908312, + "flos": 21911013198720.0, + "grad_norm": 3.7065238826599023, + "language_loss": 0.6467008, + "learning_rate": 3.205269272758513e-06, + "loss": 0.67445493, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.8057730197906494 + }, + { + "auxiliary_loss_clip": 0.01474968, + "auxiliary_loss_mlp": 0.01292032, + "balance_loss_clip": 1.1317482, + "balance_loss_mlp": 1.04598355, + "epoch": 0.31534646024349916, + "flos": 16282456077600.0, + "grad_norm": 2.240399108683134, + "language_loss": 0.91376591, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.94143593, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 2.7784080505371094 + }, + { + "auxiliary_loss_clip": 0.01473996, + "auxiliary_loss_mlp": 0.01282896, + "balance_loss_clip": 1.13240981, + "balance_loss_mlp": 1.03227007, + "epoch": 0.3154065834961671, + "flos": 24719545643040.0, + "grad_norm": 1.7854873413020564, + "language_loss": 0.75623882, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.78380775, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 2.8098723888397217 + }, + { + "auxiliary_loss_clip": 0.01474047, + "auxiliary_loss_mlp": 0.01281795, + "balance_loss_clip": 1.13069475, + "balance_loss_mlp": 1.03403091, + "epoch": 0.3154667067488351, + "flos": 35374051450080.0, + "grad_norm": 1.736499826728513, + "language_loss": 0.61477846, + "learning_rate": 3.204336675750321e-06, + "loss": 0.64233691, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.9324705600738525 + }, + { + "auxiliary_loss_clip": 0.01470587, + "auxiliary_loss_mlp": 0.01293708, + "balance_loss_clip": 1.12815833, + "balance_loss_mlp": 1.04098392, + "epoch": 0.31552683000150306, + "flos": 17458354480320.0, + "grad_norm": 3.9131806443831967, + "language_loss": 0.82070899, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84835196, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.7594919204711914 + }, + { + "auxiliary_loss_clip": 0.01473206, + "auxiliary_loss_mlp": 0.01306601, + "balance_loss_clip": 1.13077092, + "balance_loss_mlp": 1.05959892, + "epoch": 0.3155869532541711, + "flos": 18407743725600.0, + "grad_norm": 2.6291177704542936, + "language_loss": 0.85312486, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.88092291, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.7220826148986816 + }, + { + "auxiliary_loss_clip": 0.01474597, + "auxiliary_loss_mlp": 0.01291333, + "balance_loss_clip": 1.13158131, + "balance_loss_mlp": 1.04108894, + "epoch": 0.31564707650683904, + "flos": 21581907275520.0, + "grad_norm": 2.0079849088274204, + "language_loss": 0.85072899, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87838829, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 2.7336151599884033 + }, + { + "auxiliary_loss_clip": 0.01474411, + "auxiliary_loss_mlp": 0.01304074, + "balance_loss_clip": 1.13228512, + "balance_loss_mlp": 1.05707252, + "epoch": 0.315707199759507, + "flos": 21033005479200.0, + "grad_norm": 2.8866086892868554, + "language_loss": 0.6863718, + "learning_rate": 3.203092573767835e-06, + "loss": 0.71415663, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.7663071155548096 + }, + { + "auxiliary_loss_clip": 0.01473847, + "auxiliary_loss_mlp": 0.01295936, + "balance_loss_clip": 1.1307075, + "balance_loss_mlp": 1.04950643, + "epoch": 0.31576732301217497, + "flos": 26831027440800.0, + "grad_norm": 2.460300266777492, + "language_loss": 0.78988612, + "learning_rate": 3.202781434189246e-06, + "loss": 0.81758392, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 2.7758126258850098 + }, + { + "auxiliary_loss_clip": 0.01476522, + "auxiliary_loss_mlp": 0.01280719, + "balance_loss_clip": 1.13518476, + "balance_loss_mlp": 1.03333545, + "epoch": 0.31582744626484294, + "flos": 22713391438560.0, + "grad_norm": 2.0869085144872366, + "language_loss": 0.74113715, + "learning_rate": 3.202470249001066e-06, + "loss": 0.7687096, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 2.810824394226074 + }, + { + "auxiliary_loss_clip": 0.01468073, + "auxiliary_loss_mlp": 0.0128926, + "balance_loss_clip": 1.12686586, + "balance_loss_mlp": 1.03806269, + "epoch": 0.3158875695175109, + "flos": 23954223579840.0, + "grad_norm": 3.666416268081177, + "language_loss": 0.73644519, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.76401854, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.8340632915496826 + }, + { + "auxiliary_loss_clip": 0.01467439, + "auxiliary_loss_mlp": 0.01281635, + "balance_loss_clip": 1.12677944, + "balance_loss_mlp": 1.02852988, + "epoch": 0.31594769277017887, + "flos": 13263609800160.0, + "grad_norm": 2.088762332354237, + "language_loss": 0.77515578, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80264652, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.716203451156616 + }, + { + "auxiliary_loss_clip": 0.01476315, + "auxiliary_loss_mlp": 0.0129782, + "balance_loss_clip": 1.13503611, + "balance_loss_mlp": 1.04986453, + "epoch": 0.31600781602284683, + "flos": 23370541296480.0, + "grad_norm": 2.3002984032053404, + "language_loss": 0.77570981, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80345112, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.809018135070801 + }, + { + "auxiliary_loss_clip": 0.01477678, + "auxiliary_loss_mlp": 0.01312505, + "balance_loss_clip": 1.13593388, + "balance_loss_mlp": 1.07103431, + "epoch": 0.3160679392755148, + "flos": 19830632856480.0, + "grad_norm": 1.9881142004054837, + "language_loss": 0.71641707, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.74431884, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 2.7246811389923096 + }, + { + "auxiliary_loss_clip": 0.01467854, + "auxiliary_loss_mlp": 0.0128915, + "balance_loss_clip": 1.12616682, + "balance_loss_mlp": 1.03852463, + "epoch": 0.31612806252818276, + "flos": 20195277829920.0, + "grad_norm": 2.2774332018295147, + "language_loss": 0.76702833, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79459834, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 2.7580769062042236 + }, + { + "auxiliary_loss_clip": 0.01476767, + "auxiliary_loss_mlp": 0.01280967, + "balance_loss_clip": 1.13600206, + "balance_loss_mlp": 1.03015018, + "epoch": 0.31618818578085073, + "flos": 24237853274880.0, + "grad_norm": 2.395096277021478, + "language_loss": 0.72354352, + "learning_rate": 3.200602180731467e-06, + "loss": 0.75112081, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.702498435974121 + }, + { + "auxiliary_loss_clip": 0.01477112, + "auxiliary_loss_mlp": 0.01292953, + "balance_loss_clip": 1.13468647, + "balance_loss_mlp": 1.04499745, + "epoch": 0.3162483090335187, + "flos": 25084152688320.0, + "grad_norm": 2.4535339058390755, + "language_loss": 0.66111135, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68881202, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.7585597038269043 + }, + { + "auxiliary_loss_clip": 0.01462863, + "auxiliary_loss_mlp": 0.01278389, + "balance_loss_clip": 1.12201643, + "balance_loss_mlp": 1.02394867, + "epoch": 0.31630843228618666, + "flos": 26325819249120.0, + "grad_norm": 1.9771111182604248, + "language_loss": 0.72264349, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.75005603, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 2.7686421871185303 + }, + { + "auxiliary_loss_clip": 0.01568796, + "auxiliary_loss_mlp": 0.01228523, + "balance_loss_clip": 1.22675204, + "balance_loss_mlp": 1.00269318, + "epoch": 0.3163685555388547, + "flos": 66765681881280.0, + "grad_norm": 0.7539342344666206, + "language_loss": 0.50647032, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.5344435, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.3648416996002197 + }, + { + "auxiliary_loss_clip": 0.01475604, + "auxiliary_loss_mlp": 0.01297484, + "balance_loss_clip": 1.13321304, + "balance_loss_mlp": 1.04819322, + "epoch": 0.31642867879152264, + "flos": 25998116667840.0, + "grad_norm": 1.777379254694584, + "language_loss": 0.8560456, + "learning_rate": 3.19935589118856e-06, + "loss": 0.88377649, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 2.8866965770721436 + }, + { + "auxiliary_loss_clip": 0.014677, + "auxiliary_loss_mlp": 0.01282466, + "balance_loss_clip": 1.12644863, + "balance_loss_mlp": 1.03451049, + "epoch": 0.3164888020441906, + "flos": 25777296734400.0, + "grad_norm": 1.5467197418191796, + "language_loss": 0.82037085, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.8478725, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 2.7634501457214355 + }, + { + "auxiliary_loss_clip": 0.01467745, + "auxiliary_loss_mlp": 0.01286751, + "balance_loss_clip": 1.1255641, + "balance_loss_mlp": 1.03612518, + "epoch": 0.3165489252968586, + "flos": 19758303126720.0, + "grad_norm": 2.7704278348713833, + "language_loss": 0.79382229, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82136726, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 2.754669427871704 + }, + { + "auxiliary_loss_clip": 0.014677, + "auxiliary_loss_mlp": 0.01288641, + "balance_loss_clip": 1.12469435, + "balance_loss_mlp": 1.03820658, + "epoch": 0.31660904854952654, + "flos": 23186094832800.0, + "grad_norm": 1.499415429059127, + "language_loss": 0.74817431, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77573776, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 2.781954526901245 + }, + { + "auxiliary_loss_clip": 0.01461191, + "auxiliary_loss_mlp": 0.01295374, + "balance_loss_clip": 1.11965466, + "balance_loss_mlp": 1.04512978, + "epoch": 0.3166691718021945, + "flos": 20410370611200.0, + "grad_norm": 6.3521199566097355, + "language_loss": 0.79036498, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81793064, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 2.7654125690460205 + }, + { + "auxiliary_loss_clip": 0.01568235, + "auxiliary_loss_mlp": 0.01233673, + "balance_loss_clip": 1.22659373, + "balance_loss_mlp": 1.01470947, + "epoch": 0.31672929505486247, + "flos": 70151828103360.0, + "grad_norm": 0.737117445467474, + "language_loss": 0.57725018, + "learning_rate": 3.197797006055478e-06, + "loss": 0.60526925, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 3.3334503173828125 + }, + { + "auxiliary_loss_clip": 0.01468211, + "auxiliary_loss_mlp": 0.01296813, + "balance_loss_clip": 1.12504303, + "balance_loss_mlp": 1.05038381, + "epoch": 0.31678941830753043, + "flos": 14357506792320.0, + "grad_norm": 3.300384314211048, + "language_loss": 0.73638952, + "learning_rate": 3.197485092719815e-06, + "loss": 0.76403981, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 2.7678539752960205 + }, + { + "auxiliary_loss_clip": 0.01469482, + "auxiliary_loss_mlp": 0.01283828, + "balance_loss_clip": 1.12697816, + "balance_loss_mlp": 1.03530014, + "epoch": 0.3168495415601984, + "flos": 22749954549120.0, + "grad_norm": 1.92678471467403, + "language_loss": 0.79760885, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82514197, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 4.372671842575073 + }, + { + "auxiliary_loss_clip": 0.01470833, + "auxiliary_loss_mlp": 0.01291896, + "balance_loss_clip": 1.12801349, + "balance_loss_mlp": 1.04031682, + "epoch": 0.31690966481286637, + "flos": 20117031307200.0, + "grad_norm": 3.411380890723506, + "language_loss": 0.7962544, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.82388169, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 2.7520716190338135 + }, + { + "auxiliary_loss_clip": 0.01473709, + "auxiliary_loss_mlp": 0.01287669, + "balance_loss_clip": 1.13152254, + "balance_loss_mlp": 1.03856897, + "epoch": 0.31696978806553433, + "flos": 21181078473120.0, + "grad_norm": 1.9469163483107075, + "language_loss": 0.73223197, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75984573, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.743795156478882 + }, + { + "auxiliary_loss_clip": 0.01468244, + "auxiliary_loss_mlp": 0.01295379, + "balance_loss_clip": 1.12494135, + "balance_loss_mlp": 1.04208338, + "epoch": 0.3170299113182023, + "flos": 43000380652320.0, + "grad_norm": 2.4338716890254783, + "language_loss": 0.69635767, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.7239939, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 2.989400625228882 + }, + { + "auxiliary_loss_clip": 0.01468714, + "auxiliary_loss_mlp": 0.01292358, + "balance_loss_clip": 1.12621856, + "balance_loss_mlp": 1.04402161, + "epoch": 0.31709003457087026, + "flos": 24462352239840.0, + "grad_norm": 1.8442818777628691, + "language_loss": 0.6772204, + "learning_rate": 3.195924845146795e-06, + "loss": 0.70483112, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 2.769498586654663 + }, + { + "auxiliary_loss_clip": 0.01466262, + "auxiliary_loss_mlp": 0.01286791, + "balance_loss_clip": 1.12378287, + "balance_loss_mlp": 1.04169619, + "epoch": 0.3171501578235382, + "flos": 24137683911360.0, + "grad_norm": 1.587162596927474, + "language_loss": 0.80885738, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83638787, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.8528892993927 + }, + { + "auxiliary_loss_clip": 0.01470468, + "auxiliary_loss_mlp": 0.01298888, + "balance_loss_clip": 1.12761855, + "balance_loss_mlp": 1.05303049, + "epoch": 0.31721028107620625, + "flos": 18881698749120.0, + "grad_norm": 1.9443971313160102, + "language_loss": 0.73000717, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.75770068, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 2.7153096199035645 + }, + { + "auxiliary_loss_clip": 0.01469463, + "auxiliary_loss_mlp": 0.01286489, + "balance_loss_clip": 1.12605834, + "balance_loss_mlp": 1.03967786, + "epoch": 0.3172704043288742, + "flos": 23150062716480.0, + "grad_norm": 1.7104561194098313, + "language_loss": 0.78201723, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80957681, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 2.8216214179992676 + }, + { + "auxiliary_loss_clip": 0.01467932, + "auxiliary_loss_mlp": 0.0128702, + "balance_loss_clip": 1.12463021, + "balance_loss_mlp": 1.03219795, + "epoch": 0.3173305275815422, + "flos": 17860472840160.0, + "grad_norm": 1.9982031425909015, + "language_loss": 0.78646624, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.81401587, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.804161310195923 + }, + { + "auxiliary_loss_clip": 0.01571752, + "auxiliary_loss_mlp": 0.01259476, + "balance_loss_clip": 1.22981393, + "balance_loss_mlp": 1.04203796, + "epoch": 0.31739065083421014, + "flos": 59978370250080.0, + "grad_norm": 0.8732759207639851, + "language_loss": 0.62809002, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.65640229, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 6.1139140129089355 + }, + { + "auxiliary_loss_clip": 0.01473931, + "auxiliary_loss_mlp": 0.012908, + "balance_loss_clip": 1.13105643, + "balance_loss_mlp": 1.03388059, + "epoch": 0.3174507740868781, + "flos": 23803002548640.0, + "grad_norm": 2.2271929302828015, + "language_loss": 0.81479824, + "learning_rate": 3.194051051653053e-06, + "loss": 0.84244549, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 4.330230712890625 + }, + { + "auxiliary_loss_clip": 0.01471333, + "auxiliary_loss_mlp": 0.01291579, + "balance_loss_clip": 1.12752557, + "balance_loss_mlp": 1.04286075, + "epoch": 0.31751089733954607, + "flos": 27641522306880.0, + "grad_norm": 1.7243292337646867, + "language_loss": 0.7838273, + "learning_rate": 3.19373859419346e-06, + "loss": 0.81145644, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.909285306930542 + }, + { + "auxiliary_loss_clip": 0.01479019, + "auxiliary_loss_mlp": 0.0130009, + "balance_loss_clip": 1.1353544, + "balance_loss_mlp": 1.0532788, + "epoch": 0.31757102059221404, + "flos": 23771559739680.0, + "grad_norm": 2.328124119167303, + "language_loss": 0.78366435, + "learning_rate": 3.193426091467179e-06, + "loss": 0.81145543, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.80904483795166 + }, + { + "auxiliary_loss_clip": 0.01467364, + "auxiliary_loss_mlp": 0.0129393, + "balance_loss_clip": 1.12449551, + "balance_loss_mlp": 1.04101562, + "epoch": 0.317631143844882, + "flos": 25266854456640.0, + "grad_norm": 3.5629359509738427, + "language_loss": 0.67467272, + "learning_rate": 3.193113543486061e-06, + "loss": 0.70228565, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.8374156951904297 + }, + { + "auxiliary_loss_clip": 0.01576566, + "auxiliary_loss_mlp": 0.01229858, + "balance_loss_clip": 1.23490024, + "balance_loss_mlp": 1.01013184, + "epoch": 0.31769126709754997, + "flos": 55831263703200.0, + "grad_norm": 0.770749962214909, + "language_loss": 0.52754503, + "learning_rate": 3.192800950261958e-06, + "loss": 0.55560929, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 3.35552978515625 + }, + { + "auxiliary_loss_clip": 0.01469659, + "auxiliary_loss_mlp": 0.0128114, + "balance_loss_clip": 1.1272794, + "balance_loss_mlp": 1.02536476, + "epoch": 0.31775139035021793, + "flos": 16692425566560.0, + "grad_norm": 2.07928030959015, + "language_loss": 0.70537949, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.73288751, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 2.9288973808288574 + }, + { + "auxiliary_loss_clip": 0.01575332, + "auxiliary_loss_mlp": 0.01229652, + "balance_loss_clip": 1.2340647, + "balance_loss_mlp": 1.0129776, + "epoch": 0.3178115136028859, + "flos": 64233572781600.0, + "grad_norm": 0.8264607328832028, + "language_loss": 0.6038425, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.63189232, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 3.2928972244262695 + }, + { + "auxiliary_loss_clip": 0.01468563, + "auxiliary_loss_mlp": 0.01296064, + "balance_loss_clip": 1.12592328, + "balance_loss_mlp": 1.04143333, + "epoch": 0.31787163685555386, + "flos": 18699452118720.0, + "grad_norm": 2.2367327734521383, + "language_loss": 0.71987355, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74751979, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.7988839149475098 + }, + { + "auxiliary_loss_clip": 0.01466171, + "auxiliary_loss_mlp": 0.0129309, + "balance_loss_clip": 1.12335765, + "balance_loss_mlp": 1.03769577, + "epoch": 0.31793176010822183, + "flos": 21326913705600.0, + "grad_norm": 1.839815593797751, + "language_loss": 0.75541317, + "learning_rate": 3.191550125172792e-06, + "loss": 0.78300583, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.82159161567688 + }, + { + "auxiliary_loss_clip": 0.01461241, + "auxiliary_loss_mlp": 0.01276517, + "balance_loss_clip": 1.11685395, + "balance_loss_mlp": 1.02493787, + "epoch": 0.31799188336088985, + "flos": 20960751605760.0, + "grad_norm": 3.8800732085620666, + "language_loss": 0.87817872, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.9055562, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.7780921459198 + }, + { + "auxiliary_loss_clip": 0.01474209, + "auxiliary_loss_mlp": 0.01289499, + "balance_loss_clip": 1.13043547, + "balance_loss_mlp": 1.04078066, + "epoch": 0.3180520066135578, + "flos": 22494126559680.0, + "grad_norm": 1.7635060523221062, + "language_loss": 0.67998588, + "learning_rate": 3.190924441478572e-06, + "loss": 0.707623, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 2.882761001586914 + }, + { + "auxiliary_loss_clip": 0.01470296, + "auxiliary_loss_mlp": 0.01305558, + "balance_loss_clip": 1.12789917, + "balance_loss_mlp": 1.05588615, + "epoch": 0.3181121298662258, + "flos": 27237810964320.0, + "grad_norm": 2.58044184742548, + "language_loss": 0.79690421, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.8246628, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 2.859490394592285 + }, + { + "auxiliary_loss_clip": 0.01474403, + "auxiliary_loss_mlp": 0.01291045, + "balance_loss_clip": 1.13141561, + "balance_loss_mlp": 1.03946614, + "epoch": 0.31817225311889374, + "flos": 23182112376000.0, + "grad_norm": 2.113215724789019, + "language_loss": 0.79343235, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82108682, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 2.9313464164733887 + }, + { + "auxiliary_loss_clip": 0.01469622, + "auxiliary_loss_mlp": 0.01279217, + "balance_loss_clip": 1.12662888, + "balance_loss_mlp": 1.033741, + "epoch": 0.3182323763715617, + "flos": 23261079533760.0, + "grad_norm": 2.1280200010334505, + "language_loss": 0.74826419, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77575254, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.797290086746216 + }, + { + "auxiliary_loss_clip": 0.01470044, + "auxiliary_loss_mlp": 0.01280671, + "balance_loss_clip": 1.12697291, + "balance_loss_mlp": 1.03195262, + "epoch": 0.3182924996242297, + "flos": 29018897281440.0, + "grad_norm": 1.9369001380218147, + "language_loss": 0.73849362, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76600075, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.832517385482788 + }, + { + "auxiliary_loss_clip": 0.01469127, + "auxiliary_loss_mlp": 0.01293137, + "balance_loss_clip": 1.12544894, + "balance_loss_mlp": 1.04022264, + "epoch": 0.31835262287689764, + "flos": 20451295460160.0, + "grad_norm": 2.4227957418033585, + "language_loss": 0.76183259, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78945524, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.7357921600341797 + }, + { + "auxiliary_loss_clip": 0.01470752, + "auxiliary_loss_mlp": 0.01292365, + "balance_loss_clip": 1.12733078, + "balance_loss_mlp": 1.03945041, + "epoch": 0.3184127461295656, + "flos": 25121853643680.0, + "grad_norm": 3.3816639305141063, + "language_loss": 0.69678897, + "learning_rate": 3.189046306936296e-06, + "loss": 0.72442007, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 2.8303446769714355 + }, + { + "auxiliary_loss_clip": 0.01472129, + "auxiliary_loss_mlp": 0.01289715, + "balance_loss_clip": 1.12858295, + "balance_loss_mlp": 1.03927994, + "epoch": 0.31847286938223357, + "flos": 25553290835520.0, + "grad_norm": 1.9997681141455432, + "language_loss": 0.77559394, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.8032124, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 2.844069719314575 + }, + { + "auxiliary_loss_clip": 0.01463203, + "auxiliary_loss_mlp": 0.01279527, + "balance_loss_clip": 1.12053061, + "balance_loss_mlp": 1.03214383, + "epoch": 0.31853299263490154, + "flos": 27784968065280.0, + "grad_norm": 2.159132117383503, + "language_loss": 0.79660594, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.82403332, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.81958270072937 + }, + { + "auxiliary_loss_clip": 0.01466929, + "auxiliary_loss_mlp": 0.01288082, + "balance_loss_clip": 1.12595963, + "balance_loss_mlp": 1.03726542, + "epoch": 0.3185931158875695, + "flos": 22708726274880.0, + "grad_norm": 1.9195515197402893, + "language_loss": 0.74817777, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.77572787, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 2.8451905250549316 + }, + { + "auxiliary_loss_clip": 0.01468651, + "auxiliary_loss_mlp": 0.01285131, + "balance_loss_clip": 1.12619305, + "balance_loss_mlp": 1.03469586, + "epoch": 0.31865323914023747, + "flos": 24573824195040.0, + "grad_norm": 2.6780825883145307, + "language_loss": 0.77839357, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80593139, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.8216612339019775 + }, + { + "auxiliary_loss_clip": 0.01470679, + "auxiliary_loss_mlp": 0.01282787, + "balance_loss_clip": 1.12753272, + "balance_loss_mlp": 1.03006291, + "epoch": 0.31871336239290543, + "flos": 18188516774880.0, + "grad_norm": 2.677268478917248, + "language_loss": 0.84069276, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86822742, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 2.7407407760620117 + }, + { + "auxiliary_loss_clip": 0.01479107, + "auxiliary_loss_mlp": 0.01299067, + "balance_loss_clip": 1.13631105, + "balance_loss_mlp": 1.04977703, + "epoch": 0.31877348564557345, + "flos": 21828442865760.0, + "grad_norm": 3.047261336757043, + "language_loss": 0.77407467, + "learning_rate": 3.187166549199015e-06, + "loss": 0.8018564, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 2.793466567993164 + }, + { + "auxiliary_loss_clip": 0.01480624, + "auxiliary_loss_mlp": 0.01285403, + "balance_loss_clip": 1.13591194, + "balance_loss_mlp": 1.03649449, + "epoch": 0.3188336088982414, + "flos": 22017288996000.0, + "grad_norm": 2.2296211794625305, + "language_loss": 0.79822063, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.82588089, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 2.889099597930908 + }, + { + "auxiliary_loss_clip": 0.01466701, + "auxiliary_loss_mlp": 0.01294234, + "balance_loss_clip": 1.12539482, + "balance_loss_mlp": 1.04017484, + "epoch": 0.3188937321509094, + "flos": 20049973591680.0, + "grad_norm": 2.036596737935198, + "language_loss": 0.72361791, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75122726, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 2.778226375579834 + }, + { + "auxiliary_loss_clip": 0.01473209, + "auxiliary_loss_mlp": 0.01289416, + "balance_loss_clip": 1.13147068, + "balance_loss_mlp": 1.04069829, + "epoch": 0.31895385540357735, + "flos": 25850460883680.0, + "grad_norm": 4.330438259146089, + "language_loss": 0.72005808, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74768436, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 2.8635027408599854 + }, + { + "auxiliary_loss_clip": 0.01468227, + "auxiliary_loss_mlp": 0.01294889, + "balance_loss_clip": 1.12628102, + "balance_loss_mlp": 1.04597974, + "epoch": 0.3190139786562453, + "flos": 23480003059200.0, + "grad_norm": 3.031451597940883, + "language_loss": 0.64187497, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66950607, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 2.824993371963501 + }, + { + "auxiliary_loss_clip": 0.01477135, + "auxiliary_loss_mlp": 0.01290614, + "balance_loss_clip": 1.13360608, + "balance_loss_mlp": 1.04189539, + "epoch": 0.3190741019089133, + "flos": 29098092008160.0, + "grad_norm": 2.3502923675389624, + "language_loss": 0.7935614, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.82123893, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 2.8351519107818604 + }, + { + "auxiliary_loss_clip": 0.01470815, + "auxiliary_loss_mlp": 0.01279732, + "balance_loss_clip": 1.12763345, + "balance_loss_mlp": 1.03025055, + "epoch": 0.31913422516158124, + "flos": 17131486318560.0, + "grad_norm": 1.869142475046133, + "language_loss": 0.7787714, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.80627686, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 2.7898671627044678 + }, + { + "auxiliary_loss_clip": 0.01482552, + "auxiliary_loss_mlp": 0.01297364, + "balance_loss_clip": 1.13967502, + "balance_loss_mlp": 1.04483116, + "epoch": 0.3191943484142492, + "flos": 16071649178400.0, + "grad_norm": 3.2127713752486335, + "language_loss": 0.74526966, + "learning_rate": 3.184971450390961e-06, + "loss": 0.77306885, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 4.646327018737793 + }, + { + "auxiliary_loss_clip": 0.01473317, + "auxiliary_loss_mlp": 0.0128912, + "balance_loss_clip": 1.13189328, + "balance_loss_mlp": 1.03830338, + "epoch": 0.3192544716669172, + "flos": 22968385008480.0, + "grad_norm": 2.1207261875417416, + "language_loss": 0.82939517, + "learning_rate": 3.184657685014856e-06, + "loss": 0.85701954, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.7977635860443115 + }, + { + "auxiliary_loss_clip": 0.01468812, + "auxiliary_loss_mlp": 0.01293226, + "balance_loss_clip": 1.12590039, + "balance_loss_mlp": 1.04012108, + "epoch": 0.31931459491958514, + "flos": 26872786709280.0, + "grad_norm": 1.7179106728108973, + "language_loss": 0.78115642, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80877686, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 2.919309139251709 + }, + { + "auxiliary_loss_clip": 0.014721, + "auxiliary_loss_mlp": 0.01294758, + "balance_loss_clip": 1.1285758, + "balance_loss_mlp": 1.04604018, + "epoch": 0.3193747181722531, + "flos": 21838759325280.0, + "grad_norm": 1.9917221333795532, + "language_loss": 0.84588146, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.87355006, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 2.7701566219329834 + }, + { + "auxiliary_loss_clip": 0.0148034, + "auxiliary_loss_mlp": 0.01318071, + "balance_loss_clip": 1.13731265, + "balance_loss_mlp": 1.06839907, + "epoch": 0.31943484142492107, + "flos": 18326538806400.0, + "grad_norm": 5.36198974749828, + "language_loss": 0.77720696, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.8051911, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.775078058242798 + }, + { + "auxiliary_loss_clip": 0.01470872, + "auxiliary_loss_mlp": 0.012869, + "balance_loss_clip": 1.12932277, + "balance_loss_mlp": 1.03646553, + "epoch": 0.31949496467758903, + "flos": 21617939391840.0, + "grad_norm": 2.7668923785466757, + "language_loss": 0.85583663, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88341439, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.756464719772339 + }, + { + "auxiliary_loss_clip": 0.01470543, + "auxiliary_loss_mlp": 0.01290487, + "balance_loss_clip": 1.12889051, + "balance_loss_mlp": 1.04558372, + "epoch": 0.31955508793025705, + "flos": 21762257497920.0, + "grad_norm": 5.807420539471437, + "language_loss": 0.79915571, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82676601, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.804190158843994 + }, + { + "auxiliary_loss_clip": 0.0147512, + "auxiliary_loss_mlp": 0.01289259, + "balance_loss_clip": 1.13350821, + "balance_loss_mlp": 1.03710783, + "epoch": 0.319615211182925, + "flos": 17166190949280.0, + "grad_norm": 2.2984126841694743, + "language_loss": 0.67468107, + "learning_rate": 3.18277414980567e-06, + "loss": 0.70232487, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 2.7982711791992188 + }, + { + "auxiliary_loss_clip": 0.01478152, + "auxiliary_loss_mlp": 0.01285923, + "balance_loss_clip": 1.13657689, + "balance_loss_mlp": 1.03586936, + "epoch": 0.319675334435593, + "flos": 28115439402240.0, + "grad_norm": 3.6628712851528724, + "language_loss": 0.69175977, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71940053, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 4.325338125228882 + }, + { + "auxiliary_loss_clip": 0.0157651, + "auxiliary_loss_mlp": 0.01263916, + "balance_loss_clip": 1.23337054, + "balance_loss_mlp": 1.05181885, + "epoch": 0.31973545768826095, + "flos": 69508863377280.0, + "grad_norm": 0.7302716975302876, + "language_loss": 0.52956247, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55796677, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 4.963151454925537 + }, + { + "auxiliary_loss_clip": 0.0147736, + "auxiliary_loss_mlp": 0.01280403, + "balance_loss_clip": 1.13482261, + "balance_loss_mlp": 1.03206599, + "epoch": 0.3197955809409289, + "flos": 13700925856800.0, + "grad_norm": 1.8456703131551249, + "language_loss": 0.84394974, + "learning_rate": 3.181831776553012e-06, + "loss": 0.87152737, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 2.7412381172180176 + }, + { + "auxiliary_loss_clip": 0.01477306, + "auxiliary_loss_mlp": 0.01294068, + "balance_loss_clip": 1.13451815, + "balance_loss_mlp": 1.04802012, + "epoch": 0.3198557041935969, + "flos": 33220886240160.0, + "grad_norm": 2.6404139981437433, + "language_loss": 0.63719845, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.66491222, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 2.908526659011841 + }, + { + "auxiliary_loss_clip": 0.01483701, + "auxiliary_loss_mlp": 0.01290194, + "balance_loss_clip": 1.14114642, + "balance_loss_mlp": 1.0384239, + "epoch": 0.31991582744626484, + "flos": 23734275994080.0, + "grad_norm": 2.734544918596576, + "language_loss": 0.7055583, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.73329729, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 2.827227830886841 + }, + { + "auxiliary_loss_clip": 0.0148821, + "auxiliary_loss_mlp": 0.01305101, + "balance_loss_clip": 1.1451416, + "balance_loss_mlp": 1.04837203, + "epoch": 0.3199759506989328, + "flos": 18552782466720.0, + "grad_norm": 3.3149533734979104, + "language_loss": 0.8656894, + "learning_rate": 3.180888999963749e-06, + "loss": 0.89362252, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.783071994781494 + }, + { + "auxiliary_loss_clip": 0.01481827, + "auxiliary_loss_mlp": 0.01277359, + "balance_loss_clip": 1.13805842, + "balance_loss_mlp": 1.02463567, + "epoch": 0.3200360739516008, + "flos": 22421076194880.0, + "grad_norm": 1.9543810339119965, + "language_loss": 0.83377546, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.86136734, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 2.7666397094726562 + }, + { + "auxiliary_loss_clip": 0.01479164, + "auxiliary_loss_mlp": 0.01287039, + "balance_loss_clip": 1.13721037, + "balance_loss_mlp": 1.03889322, + "epoch": 0.32009619720426874, + "flos": 20597358261600.0, + "grad_norm": 1.8246508837460012, + "language_loss": 0.78484809, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.81251013, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 2.791579484939575 + }, + { + "auxiliary_loss_clip": 0.01479173, + "auxiliary_loss_mlp": 0.01274044, + "balance_loss_clip": 1.13672781, + "balance_loss_mlp": 1.02131999, + "epoch": 0.3201563204569367, + "flos": 18149450405760.0, + "grad_norm": 2.1285068366072424, + "language_loss": 0.80323499, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.83076715, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 2.837751865386963 + }, + { + "auxiliary_loss_clip": 0.01486009, + "auxiliary_loss_mlp": 0.01284541, + "balance_loss_clip": 1.14353812, + "balance_loss_mlp": 1.03296173, + "epoch": 0.32021644370960467, + "flos": 31686828579360.0, + "grad_norm": 1.850080385286273, + "language_loss": 0.74954367, + "learning_rate": 3.179631337655037e-06, + "loss": 0.7772491, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.8237533569335938 + }, + { + "auxiliary_loss_clip": 0.01484504, + "auxiliary_loss_mlp": 0.01276257, + "balance_loss_clip": 1.14087629, + "balance_loss_mlp": 1.02601242, + "epoch": 0.32027656696227264, + "flos": 26868121545600.0, + "grad_norm": 1.5171368928237599, + "language_loss": 0.81129456, + "learning_rate": 3.179316810218701e-06, + "loss": 0.83890218, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.786585569381714 + }, + { + "auxiliary_loss_clip": 0.01479673, + "auxiliary_loss_mlp": 0.01285857, + "balance_loss_clip": 1.13757753, + "balance_loss_mlp": 1.03446889, + "epoch": 0.32033669021494066, + "flos": 24172198901280.0, + "grad_norm": 1.6211085933463736, + "language_loss": 0.77979362, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80744886, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.8240747451782227 + }, + { + "auxiliary_loss_clip": 0.01489282, + "auxiliary_loss_mlp": 0.01285393, + "balance_loss_clip": 1.14679611, + "balance_loss_mlp": 1.03686523, + "epoch": 0.3203968134676086, + "flos": 24462997018560.0, + "grad_norm": 2.256739089434012, + "language_loss": 0.74026018, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76800692, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 2.835710048675537 + }, + { + "auxiliary_loss_clip": 0.01485696, + "auxiliary_loss_mlp": 0.01282168, + "balance_loss_clip": 1.1425879, + "balance_loss_mlp": 1.03345001, + "epoch": 0.3204569367202766, + "flos": 18006687354240.0, + "grad_norm": 1.8103358507833467, + "language_loss": 0.71071064, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73838931, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 2.750164747238159 + }, + { + "auxiliary_loss_clip": 0.01486754, + "auxiliary_loss_mlp": 0.01288305, + "balance_loss_clip": 1.14373112, + "balance_loss_mlp": 1.0338645, + "epoch": 0.32051705997294455, + "flos": 30592210952160.0, + "grad_norm": 2.0389090239486727, + "language_loss": 0.79772669, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.8254773, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 2.931840658187866 + }, + { + "auxiliary_loss_clip": 0.01605349, + "auxiliary_loss_mlp": 0.01233414, + "balance_loss_clip": 1.2640537, + "balance_loss_mlp": 1.02055359, + "epoch": 0.3205771832256125, + "flos": 68424259148640.0, + "grad_norm": 0.8327901306962844, + "language_loss": 0.57811904, + "learning_rate": 3.177743502478447e-06, + "loss": 0.60650659, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.2491674423217773 + }, + { + "auxiliary_loss_clip": 0.0149765, + "auxiliary_loss_mlp": 0.01287641, + "balance_loss_clip": 1.15520883, + "balance_loss_mlp": 1.03625262, + "epoch": 0.3206373064782805, + "flos": 30446489504160.0, + "grad_norm": 1.6340214588414972, + "language_loss": 0.73095769, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75881064, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 2.8633570671081543 + }, + { + "auxiliary_loss_clip": 0.01492154, + "auxiliary_loss_mlp": 0.01294642, + "balance_loss_clip": 1.15000212, + "balance_loss_mlp": 1.04821253, + "epoch": 0.32069742973094845, + "flos": 22056696718560.0, + "grad_norm": 1.95750732584747, + "language_loss": 0.70538592, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.7332539, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 2.833033800125122 + }, + { + "auxiliary_loss_clip": 0.01480776, + "auxiliary_loss_mlp": 0.01288651, + "balance_loss_clip": 1.13891482, + "balance_loss_mlp": 1.03611803, + "epoch": 0.3207575529836164, + "flos": 22056165724320.0, + "grad_norm": 2.057791700557645, + "language_loss": 0.7708441, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79853839, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 2.8293209075927734 + }, + { + "auxiliary_loss_clip": 0.01490925, + "auxiliary_loss_mlp": 0.01295023, + "balance_loss_clip": 1.14980781, + "balance_loss_mlp": 1.04554212, + "epoch": 0.3208176762362844, + "flos": 34060358584800.0, + "grad_norm": 1.8791626124841134, + "language_loss": 0.68441379, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.7122733, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.9940969944000244 + }, + { + "auxiliary_loss_clip": 0.01497806, + "auxiliary_loss_mlp": 0.01291625, + "balance_loss_clip": 1.15587544, + "balance_loss_mlp": 1.04271591, + "epoch": 0.32087779948895234, + "flos": 21800868729120.0, + "grad_norm": 4.480152345472178, + "language_loss": 0.78781348, + "learning_rate": 3.176169078234487e-06, + "loss": 0.8157078, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 2.8056223392486572 + }, + { + "auxiliary_loss_clip": 0.01482856, + "auxiliary_loss_mlp": 0.01294447, + "balance_loss_clip": 1.14150548, + "balance_loss_mlp": 1.04954338, + "epoch": 0.3209379227416203, + "flos": 21436261683840.0, + "grad_norm": 1.5790806556627541, + "language_loss": 0.74089062, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76866364, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.852872848510742 + }, + { + "auxiliary_loss_clip": 0.0148492, + "auxiliary_loss_mlp": 0.01293438, + "balance_loss_clip": 1.14342296, + "balance_loss_mlp": 1.04433775, + "epoch": 0.3209980459942883, + "flos": 25851484944000.0, + "grad_norm": 1.9444626846249686, + "language_loss": 0.63120168, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65898526, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 2.7669670581817627 + }, + { + "auxiliary_loss_clip": 0.01488219, + "auxiliary_loss_mlp": 0.01298149, + "balance_loss_clip": 1.1476227, + "balance_loss_mlp": 1.04313624, + "epoch": 0.32105816924695624, + "flos": 19101191196960.0, + "grad_norm": 2.2715812995615328, + "language_loss": 0.81340104, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84126472, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 2.758204460144043 + }, + { + "auxiliary_loss_clip": 0.01491343, + "auxiliary_loss_mlp": 0.01306697, + "balance_loss_clip": 1.14868379, + "balance_loss_mlp": 1.06160283, + "epoch": 0.3211182924996242, + "flos": 16583760295200.0, + "grad_norm": 3.3645843247516076, + "language_loss": 0.76726484, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.79524529, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 2.721890687942505 + }, + { + "auxiliary_loss_clip": 0.01481978, + "auxiliary_loss_mlp": 0.01299657, + "balance_loss_clip": 1.1404084, + "balance_loss_mlp": 1.05456245, + "epoch": 0.3211784157522922, + "flos": 22674287141280.0, + "grad_norm": 2.472895870397348, + "language_loss": 0.78956753, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81738389, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 2.762303352355957 + }, + { + "auxiliary_loss_clip": 0.0148891, + "auxiliary_loss_mlp": 0.01301182, + "balance_loss_clip": 1.14712894, + "balance_loss_mlp": 1.05170107, + "epoch": 0.3212385390049602, + "flos": 20560908935520.0, + "grad_norm": 3.1953171305716803, + "language_loss": 0.74873412, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77663505, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 2.824280023574829 + }, + { + "auxiliary_loss_clip": 0.01489676, + "auxiliary_loss_mlp": 0.01290303, + "balance_loss_clip": 1.14832687, + "balance_loss_mlp": 1.04425502, + "epoch": 0.32129866225762815, + "flos": 24793506283680.0, + "grad_norm": 1.6185697693669678, + "language_loss": 0.82765543, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85545516, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 2.807535409927368 + }, + { + "auxiliary_loss_clip": 0.01483143, + "auxiliary_loss_mlp": 0.01296129, + "balance_loss_clip": 1.14185798, + "balance_loss_mlp": 1.04416847, + "epoch": 0.3213587855102961, + "flos": 18368715284640.0, + "grad_norm": 6.011460202574382, + "language_loss": 0.79725069, + "learning_rate": 3.173647680842262e-06, + "loss": 0.82504338, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 2.7765538692474365 + }, + { + "auxiliary_loss_clip": 0.0148602, + "auxiliary_loss_mlp": 0.01286366, + "balance_loss_clip": 1.14478421, + "balance_loss_mlp": 1.03688502, + "epoch": 0.3214189087629641, + "flos": 27018432300960.0, + "grad_norm": 2.241752144231131, + "language_loss": 0.8314808, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85920465, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 4.478646993637085 + }, + { + "auxiliary_loss_clip": 0.01489617, + "auxiliary_loss_mlp": 0.012792, + "balance_loss_clip": 1.14833903, + "balance_loss_mlp": 1.02571344, + "epoch": 0.32147903201563205, + "flos": 23150479926240.0, + "grad_norm": 2.4673790426541737, + "language_loss": 0.81728947, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.84497762, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.8735547065734863 + }, + { + "auxiliary_loss_clip": 0.01492395, + "auxiliary_loss_mlp": 0.01295068, + "balance_loss_clip": 1.15088415, + "balance_loss_mlp": 1.04558635, + "epoch": 0.3215391552683, + "flos": 16582584522240.0, + "grad_norm": 2.0008317723199176, + "language_loss": 0.79509103, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82296562, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 2.7443013191223145 + }, + { + "auxiliary_loss_clip": 0.01490154, + "auxiliary_loss_mlp": 0.01305064, + "balance_loss_clip": 1.1494863, + "balance_loss_mlp": 1.05615544, + "epoch": 0.321599278520968, + "flos": 17823758016960.0, + "grad_norm": 2.2718824912415303, + "language_loss": 0.85567856, + "learning_rate": 3.172385913647542e-06, + "loss": 0.88363075, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.736018180847168 + }, + { + "auxiliary_loss_clip": 0.01492559, + "auxiliary_loss_mlp": 0.01279459, + "balance_loss_clip": 1.15214539, + "balance_loss_mlp": 1.0286423, + "epoch": 0.32165940177363594, + "flos": 16253554455360.0, + "grad_norm": 2.227323649219261, + "language_loss": 0.8089447, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83666492, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 2.937052011489868 + }, + { + "auxiliary_loss_clip": 0.01483151, + "auxiliary_loss_mlp": 0.01280504, + "balance_loss_clip": 1.14358461, + "balance_loss_mlp": 1.03197634, + "epoch": 0.3217195250263039, + "flos": 27602380081440.0, + "grad_norm": 1.7775440112633003, + "language_loss": 0.80339372, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.83103025, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.8066534996032715 + }, + { + "auxiliary_loss_clip": 0.01491013, + "auxiliary_loss_mlp": 0.01286035, + "balance_loss_clip": 1.15019143, + "balance_loss_mlp": 1.03636265, + "epoch": 0.3217796482789719, + "flos": 21472748938080.0, + "grad_norm": 1.7895152985137797, + "language_loss": 0.7609905, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.78876102, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 2.8454337120056152 + }, + { + "auxiliary_loss_clip": 0.01489072, + "auxiliary_loss_mlp": 0.01288513, + "balance_loss_clip": 1.14750338, + "balance_loss_mlp": 1.03826904, + "epoch": 0.32183977153163984, + "flos": 21217755368160.0, + "grad_norm": 2.432760081939102, + "language_loss": 0.81750327, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.8452791, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.966387987136841 + }, + { + "auxiliary_loss_clip": 0.0149049, + "auxiliary_loss_mlp": 0.0128749, + "balance_loss_clip": 1.15011299, + "balance_loss_mlp": 1.0399158, + "epoch": 0.3218998947843078, + "flos": 24610501090080.0, + "grad_norm": 1.8525685561137448, + "language_loss": 0.73305035, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.76083016, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.829854726791382 + }, + { + "auxiliary_loss_clip": 0.01492725, + "auxiliary_loss_mlp": 0.01297612, + "balance_loss_clip": 1.15218723, + "balance_loss_mlp": 1.04469764, + "epoch": 0.3219600180369758, + "flos": 22272623919360.0, + "grad_norm": 2.201373603519141, + "language_loss": 0.83647895, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.86438233, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 4.321765422821045 + }, + { + "auxiliary_loss_clip": 0.0149303, + "auxiliary_loss_mlp": 0.01294743, + "balance_loss_clip": 1.15319943, + "balance_loss_mlp": 1.04430819, + "epoch": 0.3220201412896438, + "flos": 14940885650400.0, + "grad_norm": 2.9179847421798635, + "language_loss": 0.71352386, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.74140167, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 5.666183233261108 + }, + { + "auxiliary_loss_clip": 0.01495043, + "auxiliary_loss_mlp": 0.01301846, + "balance_loss_clip": 1.15415907, + "balance_loss_mlp": 1.04931307, + "epoch": 0.32208026454231176, + "flos": 22669204767840.0, + "grad_norm": 3.2558516390891796, + "language_loss": 0.6825217, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.71049058, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.8500208854675293 + }, + { + "auxiliary_loss_clip": 0.01585838, + "auxiliary_loss_mlp": 0.01238815, + "balance_loss_clip": 1.24098921, + "balance_loss_mlp": 1.02519226, + "epoch": 0.3221403877949797, + "flos": 64612213246080.0, + "grad_norm": 0.7016572665486384, + "language_loss": 0.582394, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.61064053, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.3809287548065186 + }, + { + "auxiliary_loss_clip": 0.01485655, + "auxiliary_loss_mlp": 0.01284576, + "balance_loss_clip": 1.14509356, + "balance_loss_mlp": 1.0320431, + "epoch": 0.3222005110476477, + "flos": 20159094000960.0, + "grad_norm": 2.0198630738145047, + "language_loss": 0.83430851, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.86201084, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 2.805715799331665 + }, + { + "auxiliary_loss_clip": 0.01488091, + "auxiliary_loss_mlp": 0.01288113, + "balance_loss_clip": 1.14696229, + "balance_loss_mlp": 1.03920364, + "epoch": 0.32226063430031565, + "flos": 22676373190080.0, + "grad_norm": 1.8466204165490139, + "language_loss": 0.7963928, + "learning_rate": 3.168912388464595e-06, + "loss": 0.82415485, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 2.835824489593506 + }, + { + "auxiliary_loss_clip": 0.01579799, + "auxiliary_loss_mlp": 0.01266472, + "balance_loss_clip": 1.2353673, + "balance_loss_mlp": 1.05742645, + "epoch": 0.3223207575529836, + "flos": 63834981740640.0, + "grad_norm": 0.6568183440883361, + "language_loss": 0.5689851, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59744781, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 3.093289852142334 + }, + { + "auxiliary_loss_clip": 0.01489352, + "auxiliary_loss_mlp": 0.01292434, + "balance_loss_clip": 1.14883685, + "balance_loss_mlp": 1.04257083, + "epoch": 0.3223808808056516, + "flos": 26872559140320.0, + "grad_norm": 2.0227802717615306, + "language_loss": 0.71533871, + "learning_rate": 3.168280261735588e-06, + "loss": 0.74315655, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 2.8539581298828125 + }, + { + "auxiliary_loss_clip": 0.01484267, + "auxiliary_loss_mlp": 0.01289016, + "balance_loss_clip": 1.14312661, + "balance_loss_mlp": 1.04029799, + "epoch": 0.32244100405831955, + "flos": 26763818012640.0, + "grad_norm": 2.277378400040407, + "language_loss": 0.73994732, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76768011, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.8020520210266113 + }, + { + "auxiliary_loss_clip": 0.01487351, + "auxiliary_loss_mlp": 0.01288942, + "balance_loss_clip": 1.14581394, + "balance_loss_mlp": 1.03507352, + "epoch": 0.3225011273109875, + "flos": 23805164453760.0, + "grad_norm": 2.6039056994679384, + "language_loss": 0.76713598, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79489887, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.85518741607666 + }, + { + "auxiliary_loss_clip": 0.01493152, + "auxiliary_loss_mlp": 0.01289999, + "balance_loss_clip": 1.1515975, + "balance_loss_mlp": 1.0403266, + "epoch": 0.3225612505636555, + "flos": 17276411275200.0, + "grad_norm": 4.246530901344336, + "language_loss": 0.7700727, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79790419, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 2.7925713062286377 + }, + { + "auxiliary_loss_clip": 0.01486244, + "auxiliary_loss_mlp": 0.01292038, + "balance_loss_clip": 1.1459831, + "balance_loss_mlp": 1.0427475, + "epoch": 0.32262137381632344, + "flos": 23368417319520.0, + "grad_norm": 1.6170602310731765, + "language_loss": 0.76441717, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79219997, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.780712366104126 + }, + { + "auxiliary_loss_clip": 0.01481898, + "auxiliary_loss_mlp": 0.01292726, + "balance_loss_clip": 1.14157104, + "balance_loss_mlp": 1.04553378, + "epoch": 0.3226814970689914, + "flos": 23261079533760.0, + "grad_norm": 2.1566595959930153, + "language_loss": 0.71806455, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74581081, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 2.8318917751312256 + }, + { + "auxiliary_loss_clip": 0.01487215, + "auxiliary_loss_mlp": 0.01287877, + "balance_loss_clip": 1.14458752, + "balance_loss_mlp": 1.04240155, + "epoch": 0.32274162032165943, + "flos": 16396848501120.0, + "grad_norm": 1.9920389662640006, + "language_loss": 0.74774367, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.77549458, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 2.7430901527404785 + }, + { + "auxiliary_loss_clip": 0.01482552, + "auxiliary_loss_mlp": 0.01289192, + "balance_loss_clip": 1.14174736, + "balance_loss_mlp": 1.04619575, + "epoch": 0.3228017435743274, + "flos": 27857563292160.0, + "grad_norm": 5.689058891814036, + "language_loss": 0.78522956, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81294698, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 2.842664957046509 + }, + { + "auxiliary_loss_clip": 0.01486143, + "auxiliary_loss_mlp": 0.01283291, + "balance_loss_clip": 1.14586067, + "balance_loss_mlp": 1.03419113, + "epoch": 0.32286186682699536, + "flos": 19610761127040.0, + "grad_norm": 2.219068915587318, + "language_loss": 0.8340385, + "learning_rate": 3.16574998372661e-06, + "loss": 0.86173284, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 2.790391206741333 + }, + { + "auxiliary_loss_clip": 0.01482441, + "auxiliary_loss_mlp": 0.01289844, + "balance_loss_clip": 1.14244008, + "balance_loss_mlp": 1.04055369, + "epoch": 0.3229219900796633, + "flos": 24136318497600.0, + "grad_norm": 2.175541304331957, + "language_loss": 0.8336817, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.86140454, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 2.8081724643707275 + }, + { + "auxiliary_loss_clip": 0.01477637, + "auxiliary_loss_mlp": 0.01293715, + "balance_loss_clip": 1.13813806, + "balance_loss_mlp": 1.04327989, + "epoch": 0.3229821133323313, + "flos": 17750631795840.0, + "grad_norm": 3.564771875962289, + "language_loss": 0.88410974, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.91182315, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.7358951568603516 + }, + { + "auxiliary_loss_clip": 0.01481844, + "auxiliary_loss_mlp": 0.01293376, + "balance_loss_clip": 1.14355028, + "balance_loss_mlp": 1.04217792, + "epoch": 0.32304223658499925, + "flos": 22348215470880.0, + "grad_norm": 8.162599450218996, + "language_loss": 0.73087621, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75862837, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.790482521057129 + }, + { + "auxiliary_loss_clip": 0.01485164, + "auxiliary_loss_mlp": 0.01279403, + "balance_loss_clip": 1.14510441, + "balance_loss_mlp": 1.03030324, + "epoch": 0.3231023598376672, + "flos": 18480263096160.0, + "grad_norm": 3.09070292993757, + "language_loss": 0.81670487, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.84435052, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.724696159362793 + }, + { + "auxiliary_loss_clip": 0.01475872, + "auxiliary_loss_mlp": 0.01276426, + "balance_loss_clip": 1.13614011, + "balance_loss_mlp": 1.02618217, + "epoch": 0.3231624830903352, + "flos": 27638791479360.0, + "grad_norm": 11.499993642680517, + "language_loss": 0.87981856, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.90734148, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 2.8226749897003174 + }, + { + "auxiliary_loss_clip": 0.0148712, + "auxiliary_loss_mlp": 0.01285492, + "balance_loss_clip": 1.14828336, + "balance_loss_mlp": 1.03086138, + "epoch": 0.32322260634300315, + "flos": 21728614855680.0, + "grad_norm": 1.9670627542518766, + "language_loss": 0.76320612, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.79093224, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 2.8122096061706543 + }, + { + "auxiliary_loss_clip": 0.01478411, + "auxiliary_loss_mlp": 0.01277001, + "balance_loss_clip": 1.1390661, + "balance_loss_mlp": 1.03152549, + "epoch": 0.3232827295956711, + "flos": 22639848007680.0, + "grad_norm": 1.529050304295458, + "language_loss": 0.66788292, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69543701, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 2.795239210128784 + }, + { + "auxiliary_loss_clip": 0.0147721, + "auxiliary_loss_mlp": 0.01274064, + "balance_loss_clip": 1.13808775, + "balance_loss_mlp": 1.02515531, + "epoch": 0.3233428528483391, + "flos": 26324984829600.0, + "grad_norm": 1.7680688810559284, + "language_loss": 0.7241677, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.75168049, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 2.816096544265747 + }, + { + "auxiliary_loss_clip": 0.01482123, + "auxiliary_loss_mlp": 0.01283305, + "balance_loss_clip": 1.1430521, + "balance_loss_mlp": 1.03229761, + "epoch": 0.32340297610100704, + "flos": 28587763514880.0, + "grad_norm": 2.5270849486278713, + "language_loss": 0.82086217, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84851646, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 2.7982988357543945 + }, + { + "auxiliary_loss_clip": 0.0147328, + "auxiliary_loss_mlp": 0.01271628, + "balance_loss_clip": 1.13382339, + "balance_loss_mlp": 1.02176547, + "epoch": 0.323463099353675, + "flos": 30776429846880.0, + "grad_norm": 1.73772941558602, + "language_loss": 0.78610408, + "learning_rate": 3.162583158454388e-06, + "loss": 0.81355321, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 2.864867687225342 + }, + { + "auxiliary_loss_clip": 0.01479778, + "auxiliary_loss_mlp": 0.01283208, + "balance_loss_clip": 1.14095545, + "balance_loss_mlp": 1.03315437, + "epoch": 0.32352322260634303, + "flos": 25230974052960.0, + "grad_norm": 1.8313293972536995, + "language_loss": 0.77161455, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.7992444, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 2.808795928955078 + }, + { + "auxiliary_loss_clip": 0.01479735, + "auxiliary_loss_mlp": 0.01287444, + "balance_loss_clip": 1.14147794, + "balance_loss_mlp": 1.04025161, + "epoch": 0.323583345859011, + "flos": 23332688628480.0, + "grad_norm": 1.9200842811055543, + "language_loss": 0.7183603, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74603212, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 2.8112218379974365 + }, + { + "auxiliary_loss_clip": 0.01474769, + "auxiliary_loss_mlp": 0.01280893, + "balance_loss_clip": 1.13569403, + "balance_loss_mlp": 1.0279789, + "epoch": 0.32364346911167896, + "flos": 26209568345760.0, + "grad_norm": 2.9366661807630607, + "language_loss": 0.70407212, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.73162872, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 2.8020904064178467 + }, + { + "auxiliary_loss_clip": 0.01480353, + "auxiliary_loss_mlp": 0.01284565, + "balance_loss_clip": 1.14099193, + "balance_loss_mlp": 1.04004335, + "epoch": 0.3237035923643469, + "flos": 23698016308800.0, + "grad_norm": 2.129911142659054, + "language_loss": 0.78673363, + "learning_rate": 3.161315193285283e-06, + "loss": 0.81438279, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 4.455191612243652 + }, + { + "auxiliary_loss_clip": 0.01476348, + "auxiliary_loss_mlp": 0.0128931, + "balance_loss_clip": 1.13722634, + "balance_loss_mlp": 1.04135442, + "epoch": 0.3237637156170149, + "flos": 14430481300800.0, + "grad_norm": 2.3814620010405334, + "language_loss": 0.7535522, + "learning_rate": 3.16099809186998e-06, + "loss": 0.78120875, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.8033530712127686 + }, + { + "auxiliary_loss_clip": 0.01479714, + "auxiliary_loss_mlp": 0.0129236, + "balance_loss_clip": 1.13973486, + "balance_loss_mlp": 1.04497647, + "epoch": 0.32382383886968286, + "flos": 31065483268800.0, + "grad_norm": 2.114503371973723, + "language_loss": 0.71415979, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74188054, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.883016586303711 + }, + { + "auxiliary_loss_clip": 0.01474871, + "auxiliary_loss_mlp": 0.01282954, + "balance_loss_clip": 1.13485718, + "balance_loss_mlp": 1.03213811, + "epoch": 0.3238839621223508, + "flos": 23259107269440.0, + "grad_norm": 3.8932178236192123, + "language_loss": 0.94628644, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.97386467, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 2.8174092769622803 + }, + { + "auxiliary_loss_clip": 0.01481748, + "auxiliary_loss_mlp": 0.01286991, + "balance_loss_clip": 1.14407349, + "balance_loss_mlp": 1.03560257, + "epoch": 0.3239440853750188, + "flos": 22966792025760.0, + "grad_norm": 2.0177730832387093, + "language_loss": 0.775051, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.80273843, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.826352834701538 + }, + { + "auxiliary_loss_clip": 0.01484485, + "auxiliary_loss_mlp": 0.01285313, + "balance_loss_clip": 1.14514518, + "balance_loss_mlp": 1.03926539, + "epoch": 0.32400420862768675, + "flos": 36249555911040.0, + "grad_norm": 3.322142350757488, + "language_loss": 0.71449929, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74219728, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.8918707370758057 + }, + { + "auxiliary_loss_clip": 0.01479155, + "auxiliary_loss_mlp": 0.01281167, + "balance_loss_clip": 1.1415832, + "balance_loss_mlp": 1.03206706, + "epoch": 0.3240643318803547, + "flos": 21618660026880.0, + "grad_norm": 5.289762901240245, + "language_loss": 0.81036377, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83796692, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 2.8178391456604004 + }, + { + "auxiliary_loss_clip": 0.01494963, + "auxiliary_loss_mlp": 0.01297369, + "balance_loss_clip": 1.15602303, + "balance_loss_mlp": 1.0480783, + "epoch": 0.3241244551330227, + "flos": 23297908141440.0, + "grad_norm": 2.586835479037387, + "language_loss": 0.72975671, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75768, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 2.77854585647583 + }, + { + "auxiliary_loss_clip": 0.01483494, + "auxiliary_loss_mlp": 0.01277128, + "balance_loss_clip": 1.14497781, + "balance_loss_mlp": 1.03184283, + "epoch": 0.32418457838569065, + "flos": 14098796262720.0, + "grad_norm": 2.310521453088792, + "language_loss": 0.77410591, + "learning_rate": 3.158777149931855e-06, + "loss": 0.80171216, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.7595536708831787 + }, + { + "auxiliary_loss_clip": 0.01483393, + "auxiliary_loss_mlp": 0.01311067, + "balance_loss_clip": 1.14431942, + "balance_loss_mlp": 1.06444693, + "epoch": 0.3242447016383586, + "flos": 29755279794240.0, + "grad_norm": 2.226799677954163, + "language_loss": 0.62712824, + "learning_rate": 3.158459696652067e-06, + "loss": 0.65507287, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 4.322778940200806 + }, + { + "auxiliary_loss_clip": 0.01482514, + "auxiliary_loss_mlp": 0.01294977, + "balance_loss_clip": 1.14479804, + "balance_loss_mlp": 1.04683113, + "epoch": 0.3243048248910266, + "flos": 24353421471360.0, + "grad_norm": 1.7821488506924825, + "language_loss": 0.82896101, + "learning_rate": 3.158142199443371e-06, + "loss": 0.85673594, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.7968342304229736 + }, + { + "auxiliary_loss_clip": 0.01495569, + "auxiliary_loss_mlp": 0.01292593, + "balance_loss_clip": 1.15805471, + "balance_loss_mlp": 1.04768908, + "epoch": 0.3243649481436946, + "flos": 24355355807520.0, + "grad_norm": 2.0185329293801137, + "language_loss": 0.81948233, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.84736389, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 4.236144781112671 + }, + { + "auxiliary_loss_clip": 0.01493604, + "auxiliary_loss_mlp": 0.01288162, + "balance_loss_clip": 1.15514708, + "balance_loss_mlp": 1.04535651, + "epoch": 0.32442507139636256, + "flos": 22927194662400.0, + "grad_norm": 1.8025736122089624, + "language_loss": 0.833058, + "learning_rate": 3.157507073287417e-06, + "loss": 0.86087561, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 2.9062552452087402 + }, + { + "auxiliary_loss_clip": 0.01493646, + "auxiliary_loss_mlp": 0.0130048, + "balance_loss_clip": 1.15544081, + "balance_loss_mlp": 1.04928255, + "epoch": 0.32448519464903053, + "flos": 22202380238400.0, + "grad_norm": 2.2646001342859217, + "language_loss": 0.75407511, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.7820164, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.772764205932617 + }, + { + "auxiliary_loss_clip": 0.01486651, + "auxiliary_loss_mlp": 0.01295411, + "balance_loss_clip": 1.14842343, + "balance_loss_mlp": 1.05012631, + "epoch": 0.3245453179016985, + "flos": 18840167049600.0, + "grad_norm": 7.27386630997239, + "language_loss": 0.67317367, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.70099431, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.7728939056396484 + }, + { + "auxiliary_loss_clip": 0.01495154, + "auxiliary_loss_mlp": 0.01291203, + "balance_loss_clip": 1.15505743, + "balance_loss_mlp": 1.0426755, + "epoch": 0.32460544115436646, + "flos": 21180509550720.0, + "grad_norm": 1.489576785284235, + "language_loss": 0.73002064, + "learning_rate": 3.156554054887718e-06, + "loss": 0.7578842, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 2.752528429031372 + }, + { + "auxiliary_loss_clip": 0.01490815, + "auxiliary_loss_mlp": 0.01292757, + "balance_loss_clip": 1.15124345, + "balance_loss_mlp": 1.04651809, + "epoch": 0.3246655644070344, + "flos": 21983305000320.0, + "grad_norm": 2.5218820118223277, + "language_loss": 0.71634406, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.74417973, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 2.780334949493408 + }, + { + "auxiliary_loss_clip": 0.01485205, + "auxiliary_loss_mlp": 0.0128086, + "balance_loss_clip": 1.14711642, + "balance_loss_mlp": 1.03118777, + "epoch": 0.3247256876597024, + "flos": 32162414513760.0, + "grad_norm": 2.198472221352268, + "language_loss": 0.79927742, + "learning_rate": 3.155918489984614e-06, + "loss": 0.82693803, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 2.862928867340088 + }, + { + "auxiliary_loss_clip": 0.01492673, + "auxiliary_loss_mlp": 0.01293203, + "balance_loss_clip": 1.15317869, + "balance_loss_mlp": 1.04524803, + "epoch": 0.32478581091237035, + "flos": 20999742118560.0, + "grad_norm": 1.6255864512175278, + "language_loss": 0.87482393, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.90268272, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.7118427753448486 + }, + { + "auxiliary_loss_clip": 0.01486188, + "auxiliary_loss_mlp": 0.01278441, + "balance_loss_clip": 1.14778161, + "balance_loss_mlp": 1.03239286, + "epoch": 0.3248459341650383, + "flos": 17926582351680.0, + "grad_norm": 3.455306318820556, + "language_loss": 0.84793115, + "learning_rate": 3.155282749751332e-06, + "loss": 0.87557745, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.708601951599121 + }, + { + "auxiliary_loss_clip": 0.01493865, + "auxiliary_loss_mlp": 0.01288789, + "balance_loss_clip": 1.1562109, + "balance_loss_mlp": 1.04350436, + "epoch": 0.3249060574177063, + "flos": 24537943791360.0, + "grad_norm": 2.88830688903024, + "language_loss": 0.86937988, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89720643, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 2.758082151412964 + }, + { + "auxiliary_loss_clip": 0.01485681, + "auxiliary_loss_mlp": 0.01291092, + "balance_loss_clip": 1.14707065, + "balance_loss_mlp": 1.04561663, + "epoch": 0.32496618067037425, + "flos": 25997168463840.0, + "grad_norm": 2.1524492934090436, + "language_loss": 0.73377138, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.7615391, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 2.7874717712402344 + }, + { + "auxiliary_loss_clip": 0.01494625, + "auxiliary_loss_mlp": 0.01291853, + "balance_loss_clip": 1.15601444, + "balance_loss_mlp": 1.04561388, + "epoch": 0.3250263039230422, + "flos": 19575828927360.0, + "grad_norm": 2.190670787860181, + "language_loss": 0.83203721, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85990196, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.754650354385376 + }, + { + "auxiliary_loss_clip": 0.01495297, + "auxiliary_loss_mlp": 0.01289487, + "balance_loss_clip": 1.15656853, + "balance_loss_mlp": 1.04343951, + "epoch": 0.3250864271757102, + "flos": 16765513859520.0, + "grad_norm": 2.1814535514067956, + "language_loss": 0.87872583, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90657365, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 2.7741730213165283 + }, + { + "auxiliary_loss_clip": 0.0148488, + "auxiliary_loss_mlp": 0.01276649, + "balance_loss_clip": 1.14539409, + "balance_loss_mlp": 1.03079152, + "epoch": 0.3251465504283782, + "flos": 27821986313760.0, + "grad_norm": 1.4734026571619792, + "language_loss": 0.69512784, + "learning_rate": 3.153692632731479e-06, + "loss": 0.72274309, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 2.7933547496795654 + }, + { + "auxiliary_loss_clip": 0.01487239, + "auxiliary_loss_mlp": 0.01285474, + "balance_loss_clip": 1.14803112, + "balance_loss_mlp": 1.03408504, + "epoch": 0.32520667368104617, + "flos": 19065197008800.0, + "grad_norm": 2.669794274389278, + "language_loss": 0.77769244, + "learning_rate": 3.153374478034841e-06, + "loss": 0.80541956, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 2.73880934715271 + }, + { + "auxiliary_loss_clip": 0.01476065, + "auxiliary_loss_mlp": 0.01286896, + "balance_loss_clip": 1.13730502, + "balance_loss_mlp": 1.03817785, + "epoch": 0.32526679693371413, + "flos": 29384414602560.0, + "grad_norm": 1.9171547368435093, + "language_loss": 0.83155322, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85918283, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.8657009601593018 + }, + { + "auxiliary_loss_clip": 0.01487852, + "auxiliary_loss_mlp": 0.01286845, + "balance_loss_clip": 1.14799023, + "balance_loss_mlp": 1.04422987, + "epoch": 0.3253269201863821, + "flos": 20706630383520.0, + "grad_norm": 2.1581556759458547, + "language_loss": 0.71115005, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73889697, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.752633571624756 + }, + { + "auxiliary_loss_clip": 0.01484698, + "auxiliary_loss_mlp": 0.01276687, + "balance_loss_clip": 1.14736176, + "balance_loss_mlp": 1.03063893, + "epoch": 0.32538704343905006, + "flos": 29096612809920.0, + "grad_norm": 1.8072472087550961, + "language_loss": 0.83240175, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.86001563, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.9024062156677246 + }, + { + "auxiliary_loss_clip": 0.01486, + "auxiliary_loss_mlp": 0.01278661, + "balance_loss_clip": 1.14587355, + "balance_loss_mlp": 1.02669978, + "epoch": 0.325447166691718, + "flos": 24678165656160.0, + "grad_norm": 1.9181832814670625, + "language_loss": 0.8067323, + "learning_rate": 3.152101422008203e-06, + "loss": 0.83437896, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.862959861755371 + }, + { + "auxiliary_loss_clip": 0.01488767, + "auxiliary_loss_mlp": 0.01277565, + "balance_loss_clip": 1.15006101, + "balance_loss_mlp": 1.03056335, + "epoch": 0.325507289944386, + "flos": 21545268308640.0, + "grad_norm": 2.137274318908313, + "language_loss": 0.76563692, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79330027, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 2.833268642425537 + }, + { + "auxiliary_loss_clip": 0.01576705, + "auxiliary_loss_mlp": 0.01238266, + "balance_loss_clip": 1.23219967, + "balance_loss_mlp": 1.02616882, + "epoch": 0.32556741319705396, + "flos": 71525144764800.0, + "grad_norm": 0.8976332131471497, + "language_loss": 0.63932669, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66747642, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.2959280014038086 + }, + { + "auxiliary_loss_clip": 0.01477023, + "auxiliary_loss_mlp": 0.01277957, + "balance_loss_clip": 1.13937235, + "balance_loss_mlp": 1.02923894, + "epoch": 0.3256275364497219, + "flos": 23734693203840.0, + "grad_norm": 1.8268817431499529, + "language_loss": 0.74272257, + "learning_rate": 3.151146171224075e-06, + "loss": 0.77027237, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 2.8324294090270996 + }, + { + "auxiliary_loss_clip": 0.01576569, + "auxiliary_loss_mlp": 0.01251984, + "balance_loss_clip": 1.23181593, + "balance_loss_mlp": 1.04141235, + "epoch": 0.3256876597023899, + "flos": 67295543741280.0, + "grad_norm": 0.7785837037721342, + "language_loss": 0.57866025, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60694575, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 3.36851167678833 + }, + { + "auxiliary_loss_clip": 0.01571465, + "auxiliary_loss_mlp": 0.01258286, + "balance_loss_clip": 1.22711432, + "balance_loss_mlp": 1.04695129, + "epoch": 0.32574778295505785, + "flos": 71289797981760.0, + "grad_norm": 0.8282541226738142, + "language_loss": 0.63374186, + "learning_rate": 3.150509119089975e-06, + "loss": 0.6620394, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 3.3875441551208496 + }, + { + "auxiliary_loss_clip": 0.01476337, + "auxiliary_loss_mlp": 0.01284597, + "balance_loss_clip": 1.13874865, + "balance_loss_mlp": 1.03778648, + "epoch": 0.3258079062077258, + "flos": 20778429119040.0, + "grad_norm": 3.22216174341622, + "language_loss": 0.69028938, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71789873, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 2.8350770473480225 + }, + { + "auxiliary_loss_clip": 0.01486941, + "auxiliary_loss_mlp": 0.0129766, + "balance_loss_clip": 1.14815187, + "balance_loss_mlp": 1.04932332, + "epoch": 0.3258680294603938, + "flos": 22237577935200.0, + "grad_norm": 2.054177928948345, + "language_loss": 0.76865697, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79650295, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 2.860903024673462 + }, + { + "auxiliary_loss_clip": 0.0148348, + "auxiliary_loss_mlp": 0.01283527, + "balance_loss_clip": 1.14435029, + "balance_loss_mlp": 1.03518987, + "epoch": 0.3259281527130618, + "flos": 26982438112800.0, + "grad_norm": 1.901574124837035, + "language_loss": 0.80180138, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82947147, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 2.8351821899414062 + }, + { + "auxiliary_loss_clip": 0.01480507, + "auxiliary_loss_mlp": 0.01280554, + "balance_loss_clip": 1.14290881, + "balance_loss_mlp": 1.03297997, + "epoch": 0.32598827596572977, + "flos": 26216698839840.0, + "grad_norm": 2.12338999181122, + "language_loss": 0.75655341, + "learning_rate": 3.149234491389381e-06, + "loss": 0.78416401, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 4.431002378463745 + }, + { + "auxiliary_loss_clip": 0.01487959, + "auxiliary_loss_mlp": 0.01293682, + "balance_loss_clip": 1.15143204, + "balance_loss_mlp": 1.04515505, + "epoch": 0.32604839921839773, + "flos": 17641701027360.0, + "grad_norm": 2.2952896911422394, + "language_loss": 0.63203359, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.65985, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.8091015815734863 + }, + { + "auxiliary_loss_clip": 0.0147875, + "auxiliary_loss_mlp": 0.01276906, + "balance_loss_clip": 1.14061677, + "balance_loss_mlp": 1.02914166, + "epoch": 0.3261085224710657, + "flos": 23624965944000.0, + "grad_norm": 1.6166727912361352, + "language_loss": 0.74833286, + "learning_rate": 3.148596916016224e-06, + "loss": 0.7758894, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.8179330825805664 + }, + { + "auxiliary_loss_clip": 0.01479665, + "auxiliary_loss_mlp": 0.0128488, + "balance_loss_clip": 1.14195526, + "balance_loss_mlp": 1.03635263, + "epoch": 0.32616864572373366, + "flos": 23262824229120.0, + "grad_norm": 2.2891400528707857, + "language_loss": 0.77030826, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79795372, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.833739995956421 + }, + { + "auxiliary_loss_clip": 0.01476388, + "auxiliary_loss_mlp": 0.01292414, + "balance_loss_clip": 1.14025939, + "balance_loss_mlp": 1.04026306, + "epoch": 0.32622876897640163, + "flos": 25595808667200.0, + "grad_norm": 2.775074749499776, + "language_loss": 0.78010535, + "learning_rate": 3.147959166423428e-06, + "loss": 0.80779338, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 2.803008794784546 + }, + { + "auxiliary_loss_clip": 0.01480455, + "auxiliary_loss_mlp": 0.01278648, + "balance_loss_clip": 1.14328408, + "balance_loss_mlp": 1.02859461, + "epoch": 0.3262888922290696, + "flos": 22421189979360.0, + "grad_norm": 1.8095943809922157, + "language_loss": 0.74707246, + "learning_rate": 3.147640226324893e-06, + "loss": 0.77466351, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.831312656402588 + }, + { + "auxiliary_loss_clip": 0.01482196, + "auxiliary_loss_mlp": 0.01277922, + "balance_loss_clip": 1.14534688, + "balance_loss_mlp": 1.02119255, + "epoch": 0.32634901548173756, + "flos": 19720943524800.0, + "grad_norm": 1.6501330118864763, + "language_loss": 0.78938323, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81698442, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 2.8252668380737305 + }, + { + "auxiliary_loss_clip": 0.01472602, + "auxiliary_loss_mlp": 0.01280032, + "balance_loss_clip": 1.13431334, + "balance_loss_mlp": 1.03226709, + "epoch": 0.3264091387344055, + "flos": 16144737471360.0, + "grad_norm": 2.495315719254583, + "language_loss": 0.71019518, + "learning_rate": 3.147002215584023e-06, + "loss": 0.7377215, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.77701997756958 + }, + { + "auxiliary_loss_clip": 0.01472882, + "auxiliary_loss_mlp": 0.01277424, + "balance_loss_clip": 1.13598931, + "balance_loss_mlp": 1.02965963, + "epoch": 0.3264692619870735, + "flos": 16400944742400.0, + "grad_norm": 1.705109019371875, + "language_loss": 0.79223078, + "learning_rate": 3.146683144965881e-06, + "loss": 0.81973386, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 2.7523856163024902 + }, + { + "auxiliary_loss_clip": 0.01484664, + "auxiliary_loss_mlp": 0.01289801, + "balance_loss_clip": 1.14677322, + "balance_loss_mlp": 1.04165459, + "epoch": 0.32652938523974145, + "flos": 22384513084320.0, + "grad_norm": 2.149269753185805, + "language_loss": 0.83860874, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86635333, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 4.182904481887817 + }, + { + "auxiliary_loss_clip": 0.01472967, + "auxiliary_loss_mlp": 0.01282138, + "balance_loss_clip": 1.13617492, + "balance_loss_mlp": 1.03227544, + "epoch": 0.3265895084924094, + "flos": 21910216707360.0, + "grad_norm": 1.6924078136799838, + "language_loss": 0.70760119, + "learning_rate": 3.146044873294678e-06, + "loss": 0.73515224, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 4.2375407218933105 + }, + { + "auxiliary_loss_clip": 0.01471604, + "auxiliary_loss_mlp": 0.01272799, + "balance_loss_clip": 1.13396788, + "balance_loss_mlp": 1.02369881, + "epoch": 0.3266496317450774, + "flos": 16068804566400.0, + "grad_norm": 7.539627164322289, + "language_loss": 0.84127998, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86872399, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 4.2623865604400635 + }, + { + "auxiliary_loss_clip": 0.01471766, + "auxiliary_loss_mlp": 0.01282637, + "balance_loss_clip": 1.13457239, + "balance_loss_mlp": 1.03582656, + "epoch": 0.3267097549977454, + "flos": 22530424173120.0, + "grad_norm": 1.4289610977728768, + "language_loss": 0.85365444, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88119847, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.8240671157836914 + }, + { + "auxiliary_loss_clip": 0.01476888, + "auxiliary_loss_mlp": 0.01280718, + "balance_loss_clip": 1.1407057, + "balance_loss_mlp": 1.03085566, + "epoch": 0.32676987825041337, + "flos": 27272819020320.0, + "grad_norm": 2.292483710725517, + "language_loss": 0.8813979, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.90897393, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 2.8177707195281982 + }, + { + "auxiliary_loss_clip": 0.01478103, + "auxiliary_loss_mlp": 0.01280277, + "balance_loss_clip": 1.14102077, + "balance_loss_mlp": 1.03346634, + "epoch": 0.32683000150308134, + "flos": 11509756266240.0, + "grad_norm": 2.8224082581067194, + "language_loss": 0.76722014, + "learning_rate": 3.144767808551479e-06, + "loss": 0.79480398, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.7788050174713135 + }, + { + "auxiliary_loss_clip": 0.01477903, + "auxiliary_loss_mlp": 0.01272652, + "balance_loss_clip": 1.14166665, + "balance_loss_mlp": 1.02832115, + "epoch": 0.3268901247557493, + "flos": 25632599346720.0, + "grad_norm": 2.3344274185820355, + "language_loss": 0.72016144, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74766707, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.804164171218872 + }, + { + "auxiliary_loss_clip": 0.01476094, + "auxiliary_loss_mlp": 0.01291433, + "balance_loss_clip": 1.13913584, + "balance_loss_mlp": 1.0423336, + "epoch": 0.32695024800841727, + "flos": 24863029329600.0, + "grad_norm": 1.7039469092695598, + "language_loss": 0.63827372, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66594899, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.896838903427124 + }, + { + "auxiliary_loss_clip": 0.01481382, + "auxiliary_loss_mlp": 0.01297138, + "balance_loss_clip": 1.1451993, + "balance_loss_mlp": 1.05280685, + "epoch": 0.32701037126108523, + "flos": 28841733024480.0, + "grad_norm": 2.1164015843736337, + "language_loss": 0.74533463, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.77311981, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 2.849301338195801 + }, + { + "auxiliary_loss_clip": 0.01480719, + "auxiliary_loss_mlp": 0.01292159, + "balance_loss_clip": 1.14505887, + "balance_loss_mlp": 1.04630172, + "epoch": 0.3270704945137532, + "flos": 27967593977280.0, + "grad_norm": 2.0268851749197694, + "language_loss": 0.74599665, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77372539, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 2.932615280151367 + }, + { + "auxiliary_loss_clip": 0.01474843, + "auxiliary_loss_mlp": 0.01298238, + "balance_loss_clip": 1.1390934, + "balance_loss_mlp": 1.0554328, + "epoch": 0.32713061776642116, + "flos": 23692668438240.0, + "grad_norm": 2.2591527140351166, + "language_loss": 0.85122472, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.87895548, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.866180419921875 + }, + { + "auxiliary_loss_clip": 0.01475234, + "auxiliary_loss_mlp": 0.01300313, + "balance_loss_clip": 1.13907647, + "balance_loss_mlp": 1.05293012, + "epoch": 0.3271907410190891, + "flos": 22457828946240.0, + "grad_norm": 2.412366448911782, + "language_loss": 0.86280203, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.89055747, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.729646921157837 + }, + { + "auxiliary_loss_clip": 0.01471194, + "auxiliary_loss_mlp": 0.01305429, + "balance_loss_clip": 1.13584507, + "balance_loss_mlp": 1.06052589, + "epoch": 0.3272508642717571, + "flos": 22822322207040.0, + "grad_norm": 2.1387644466770666, + "language_loss": 0.7766481, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.80441439, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 2.8231770992279053 + }, + { + "auxiliary_loss_clip": 0.0147492, + "auxiliary_loss_mlp": 0.01288331, + "balance_loss_clip": 1.1381371, + "balance_loss_mlp": 1.04590726, + "epoch": 0.32731098752442506, + "flos": 11802943857600.0, + "grad_norm": 2.1097101440875385, + "language_loss": 0.8127898, + "learning_rate": 3.142211596174343e-06, + "loss": 0.84042227, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 2.7660062313079834 + }, + { + "auxiliary_loss_clip": 0.01475651, + "auxiliary_loss_mlp": 0.01283722, + "balance_loss_clip": 1.13905597, + "balance_loss_mlp": 1.03843689, + "epoch": 0.327371110777093, + "flos": 21029629872960.0, + "grad_norm": 5.5983926627184335, + "language_loss": 0.59686387, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.6244576, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.7729170322418213 + }, + { + "auxiliary_loss_clip": 0.01479955, + "auxiliary_loss_mlp": 0.01304019, + "balance_loss_clip": 1.145051, + "balance_loss_mlp": 1.0577805, + "epoch": 0.327431234029761, + "flos": 19064097092160.0, + "grad_norm": 2.2152491864388195, + "language_loss": 0.88557315, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.91341287, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 2.706502914428711 + }, + { + "auxiliary_loss_clip": 0.01480342, + "auxiliary_loss_mlp": 0.01288386, + "balance_loss_clip": 1.14556384, + "balance_loss_mlp": 1.0347085, + "epoch": 0.32749135728242895, + "flos": 25851636656640.0, + "grad_norm": 1.701356081547938, + "language_loss": 0.78771222, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81539953, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 2.8161654472351074 + }, + { + "auxiliary_loss_clip": 0.01470874, + "auxiliary_loss_mlp": 0.01287987, + "balance_loss_clip": 1.13357258, + "balance_loss_mlp": 1.04155731, + "epoch": 0.327551480535097, + "flos": 20122606746720.0, + "grad_norm": 2.153059615263197, + "language_loss": 0.73188686, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75947547, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.742182731628418 + }, + { + "auxiliary_loss_clip": 0.01476373, + "auxiliary_loss_mlp": 0.01285878, + "balance_loss_clip": 1.14093113, + "balance_loss_mlp": 1.04154706, + "epoch": 0.32761160378776494, + "flos": 28806080189760.0, + "grad_norm": 1.7970619573402882, + "language_loss": 0.66909277, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69671535, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.925917148590088 + }, + { + "auxiliary_loss_clip": 0.01474118, + "auxiliary_loss_mlp": 0.01284499, + "balance_loss_clip": 1.13717818, + "balance_loss_mlp": 1.03673482, + "epoch": 0.3276717270404329, + "flos": 26939920281120.0, + "grad_norm": 1.7311937604806436, + "language_loss": 0.65435272, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.68193889, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.807915210723877 + }, + { + "auxiliary_loss_clip": 0.01474441, + "auxiliary_loss_mlp": 0.01285556, + "balance_loss_clip": 1.13725364, + "balance_loss_mlp": 1.04198718, + "epoch": 0.32773185029310087, + "flos": 25340815097280.0, + "grad_norm": 2.0707189144457927, + "language_loss": 0.77386463, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.80146456, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.8098888397216797 + }, + { + "auxiliary_loss_clip": 0.0147537, + "auxiliary_loss_mlp": 0.01282986, + "balance_loss_clip": 1.14062905, + "balance_loss_mlp": 1.03426743, + "epoch": 0.32779197354576883, + "flos": 26393901024960.0, + "grad_norm": 2.924600795418039, + "language_loss": 0.70737827, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.73496181, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 2.8140406608581543 + }, + { + "auxiliary_loss_clip": 0.01473906, + "auxiliary_loss_mlp": 0.01281079, + "balance_loss_clip": 1.13612223, + "balance_loss_mlp": 1.0380826, + "epoch": 0.3278520967984368, + "flos": 24901678488960.0, + "grad_norm": 1.6742957189018768, + "language_loss": 0.78735119, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.81490111, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 2.776287794113159 + }, + { + "auxiliary_loss_clip": 0.01477974, + "auxiliary_loss_mlp": 0.01284024, + "balance_loss_clip": 1.14149618, + "balance_loss_mlp": 1.0347333, + "epoch": 0.32791222005110476, + "flos": 29755621147680.0, + "grad_norm": 3.0427365409952105, + "language_loss": 0.75145197, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77907199, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.853172779083252 + }, + { + "auxiliary_loss_clip": 0.01470183, + "auxiliary_loss_mlp": 0.01280507, + "balance_loss_clip": 1.13401878, + "balance_loss_mlp": 1.03636599, + "epoch": 0.32797234330377273, + "flos": 16509913439040.0, + "grad_norm": 2.8284623913907714, + "language_loss": 0.76561362, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.7931205, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 2.766874313354492 + }, + { + "auxiliary_loss_clip": 0.01475191, + "auxiliary_loss_mlp": 0.01292068, + "balance_loss_clip": 1.13801527, + "balance_loss_mlp": 1.04125178, + "epoch": 0.3280324665564407, + "flos": 26580319752960.0, + "grad_norm": 1.8109788759451395, + "language_loss": 0.74030352, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76797605, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.885263204574585 + }, + { + "auxiliary_loss_clip": 0.01473586, + "auxiliary_loss_mlp": 0.01282834, + "balance_loss_clip": 1.13699245, + "balance_loss_mlp": 1.03659511, + "epoch": 0.32809258980910866, + "flos": 22932201179520.0, + "grad_norm": 1.7670262443877418, + "language_loss": 0.78282642, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.81039059, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 2.7759742736816406 + }, + { + "auxiliary_loss_clip": 0.0146477, + "auxiliary_loss_mlp": 0.01273577, + "balance_loss_clip": 1.12823033, + "balance_loss_mlp": 1.02409601, + "epoch": 0.3281527130617766, + "flos": 22786062521760.0, + "grad_norm": 3.121410104683589, + "language_loss": 0.7862519, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81363541, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 2.749307870864868 + }, + { + "auxiliary_loss_clip": 0.0146559, + "auxiliary_loss_mlp": 0.01276247, + "balance_loss_clip": 1.12861609, + "balance_loss_mlp": 1.03077126, + "epoch": 0.3282128363144446, + "flos": 21252763424160.0, + "grad_norm": 2.6145467025692266, + "language_loss": 0.73039854, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75781691, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 4.329895496368408 + }, + { + "auxiliary_loss_clip": 0.01474895, + "auxiliary_loss_mlp": 0.01287227, + "balance_loss_clip": 1.13907373, + "balance_loss_mlp": 1.03812718, + "epoch": 0.32827295956711255, + "flos": 30845497754880.0, + "grad_norm": 2.047409671195547, + "language_loss": 0.84060758, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86822879, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 2.8760287761688232 + }, + { + "auxiliary_loss_clip": 0.01467155, + "auxiliary_loss_mlp": 0.01275986, + "balance_loss_clip": 1.13052809, + "balance_loss_mlp": 1.0322274, + "epoch": 0.3283330828197806, + "flos": 25917025533120.0, + "grad_norm": 1.8651858382513367, + "language_loss": 0.77438426, + "learning_rate": 3.136770448642288e-06, + "loss": 0.80181575, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 2.7654669284820557 + }, + { + "auxiliary_loss_clip": 0.0146591, + "auxiliary_loss_mlp": 0.01286081, + "balance_loss_clip": 1.12958801, + "balance_loss_mlp": 1.03831625, + "epoch": 0.32839320607244854, + "flos": 38585157392160.0, + "grad_norm": 1.853305228227923, + "language_loss": 0.62848997, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65600991, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.926790475845337 + }, + { + "auxiliary_loss_clip": 0.01471562, + "auxiliary_loss_mlp": 0.01288076, + "balance_loss_clip": 1.13436794, + "balance_loss_mlp": 1.04279065, + "epoch": 0.3284533293251165, + "flos": 26653294261440.0, + "grad_norm": 1.611691669718245, + "language_loss": 0.7784577, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80605406, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.781925916671753 + }, + { + "auxiliary_loss_clip": 0.01468575, + "auxiliary_loss_mlp": 0.01280074, + "balance_loss_clip": 1.1315521, + "balance_loss_mlp": 1.03345418, + "epoch": 0.32851345257778447, + "flos": 15305947833600.0, + "grad_norm": 4.701399592936617, + "language_loss": 0.6992076, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72669411, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.744487762451172 + }, + { + "auxiliary_loss_clip": 0.01464983, + "auxiliary_loss_mlp": 0.01276174, + "balance_loss_clip": 1.12943244, + "balance_loss_mlp": 1.03184283, + "epoch": 0.32857357583045244, + "flos": 23515997247360.0, + "grad_norm": 2.1559095025011894, + "language_loss": 0.72682983, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.75424141, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 2.7753310203552246 + }, + { + "auxiliary_loss_clip": 0.01472539, + "auxiliary_loss_mlp": 0.01279455, + "balance_loss_clip": 1.13689744, + "balance_loss_mlp": 1.03340697, + "epoch": 0.3286336990831204, + "flos": 20997276788160.0, + "grad_norm": 1.5261461274083048, + "language_loss": 0.82563329, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85315323, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 2.78334379196167 + }, + { + "auxiliary_loss_clip": 0.01463985, + "auxiliary_loss_mlp": 0.0128251, + "balance_loss_clip": 1.12867212, + "balance_loss_mlp": 1.03741574, + "epoch": 0.32869382233578837, + "flos": 23661035988480.0, + "grad_norm": 1.8269928836128901, + "language_loss": 0.79302371, + "learning_rate": 3.134847066213879e-06, + "loss": 0.82048863, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.797015428543091 + }, + { + "auxiliary_loss_clip": 0.01461488, + "auxiliary_loss_mlp": 0.01277212, + "balance_loss_clip": 1.12666965, + "balance_loss_mlp": 1.02887499, + "epoch": 0.32875394558845633, + "flos": 25338804904800.0, + "grad_norm": 1.7000579758458199, + "language_loss": 0.74421561, + "learning_rate": 3.134526351787587e-06, + "loss": 0.77160263, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.813950777053833 + }, + { + "auxiliary_loss_clip": 0.01470507, + "auxiliary_loss_mlp": 0.01290571, + "balance_loss_clip": 1.1338141, + "balance_loss_mlp": 1.04604876, + "epoch": 0.3288140688411243, + "flos": 14904891462240.0, + "grad_norm": 2.6323991822441077, + "language_loss": 0.78693712, + "learning_rate": 3.134205594339942e-06, + "loss": 0.8145479, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 4.270619869232178 + }, + { + "auxiliary_loss_clip": 0.01464755, + "auxiliary_loss_mlp": 0.01286259, + "balance_loss_clip": 1.12955749, + "balance_loss_mlp": 1.04402542, + "epoch": 0.32887419209379226, + "flos": 18553313460960.0, + "grad_norm": 2.0049537181440424, + "language_loss": 0.8205725, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84808254, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 2.739262342453003 + }, + { + "auxiliary_loss_clip": 0.01461409, + "auxiliary_loss_mlp": 0.01283576, + "balance_loss_clip": 1.12384653, + "balance_loss_mlp": 1.04172444, + "epoch": 0.3289343153464602, + "flos": 48109278952800.0, + "grad_norm": 2.1724408129466113, + "language_loss": 0.68053102, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.70798087, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 3.0376553535461426 + }, + { + "auxiliary_loss_clip": 0.01463643, + "auxiliary_loss_mlp": 0.01289256, + "balance_loss_clip": 1.12679935, + "balance_loss_mlp": 1.04130101, + "epoch": 0.3289944385991282, + "flos": 27602797291200.0, + "grad_norm": 1.705041501848733, + "language_loss": 0.65018058, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67770958, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 4.299004793167114 + }, + { + "auxiliary_loss_clip": 0.0146462, + "auxiliary_loss_mlp": 0.01293516, + "balance_loss_clip": 1.12735677, + "balance_loss_mlp": 1.04689622, + "epoch": 0.32905456185179616, + "flos": 20122379177760.0, + "grad_norm": 1.764306135396689, + "language_loss": 0.88446653, + "learning_rate": 3.13292213457912e-06, + "loss": 0.91204786, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 2.7803094387054443 + }, + { + "auxiliary_loss_clip": 0.01468003, + "auxiliary_loss_mlp": 0.01293855, + "balance_loss_clip": 1.13033032, + "balance_loss_mlp": 1.04589963, + "epoch": 0.3291146851044642, + "flos": 23182188232320.0, + "grad_norm": 17.386139234141098, + "language_loss": 0.7789548, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80657339, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 2.7822062969207764 + }, + { + "auxiliary_loss_clip": 0.01553645, + "auxiliary_loss_mlp": 0.01255714, + "balance_loss_clip": 1.2181921, + "balance_loss_mlp": 1.03980255, + "epoch": 0.32917480835713214, + "flos": 67628518336800.0, + "grad_norm": 0.8248880457276977, + "language_loss": 0.60115576, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62924939, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.2899208068847656 + }, + { + "auxiliary_loss_clip": 0.01462466, + "auxiliary_loss_mlp": 0.01291616, + "balance_loss_clip": 1.12609577, + "balance_loss_mlp": 1.04232526, + "epoch": 0.3292349316098001, + "flos": 27967025054880.0, + "grad_norm": 3.269447768666469, + "language_loss": 0.77094698, + "learning_rate": 3.131959088630455e-06, + "loss": 0.79848784, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 2.928896427154541 + }, + { + "auxiliary_loss_clip": 0.01465937, + "auxiliary_loss_mlp": 0.01293692, + "balance_loss_clip": 1.12916625, + "balance_loss_mlp": 1.05012393, + "epoch": 0.3292950548624681, + "flos": 20265597367200.0, + "grad_norm": 3.3416203430324054, + "language_loss": 0.74716783, + "learning_rate": 3.131637987449997e-06, + "loss": 0.77476406, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.805138349533081 + }, + { + "auxiliary_loss_clip": 0.01466644, + "auxiliary_loss_mlp": 0.01295373, + "balance_loss_clip": 1.12901616, + "balance_loss_mlp": 1.05428398, + "epoch": 0.32935517811513604, + "flos": 20815068085920.0, + "grad_norm": 4.099227779565656, + "language_loss": 0.75263178, + "learning_rate": 3.131316843357713e-06, + "loss": 0.78025198, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 2.777388572692871 + }, + { + "auxiliary_loss_clip": 0.01466366, + "auxiliary_loss_mlp": 0.01290638, + "balance_loss_clip": 1.12852657, + "balance_loss_mlp": 1.04611635, + "epoch": 0.329415301367804, + "flos": 18443813770080.0, + "grad_norm": 1.7800998328453022, + "language_loss": 0.80333453, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.8309046, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 2.81718111038208 + }, + { + "auxiliary_loss_clip": 0.0156405, + "auxiliary_loss_mlp": 0.01246941, + "balance_loss_clip": 1.22871387, + "balance_loss_mlp": 1.03179169, + "epoch": 0.32947542462047197, + "flos": 66330376017120.0, + "grad_norm": 0.7461884214469946, + "language_loss": 0.56394756, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.59205741, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.3780922889709473 + }, + { + "auxiliary_loss_clip": 0.01464035, + "auxiliary_loss_mlp": 0.0128361, + "balance_loss_clip": 1.12666798, + "balance_loss_mlp": 1.03718066, + "epoch": 0.32953554787313993, + "flos": 23223833716320.0, + "grad_norm": 1.9822430779393623, + "language_loss": 0.77103454, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79851097, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.761390209197998 + }, + { + "auxiliary_loss_clip": 0.01464843, + "auxiliary_loss_mlp": 0.01277185, + "balance_loss_clip": 1.12761629, + "balance_loss_mlp": 1.02922964, + "epoch": 0.3295956711258079, + "flos": 27011567304000.0, + "grad_norm": 2.114948698270483, + "language_loss": 0.79191995, + "learning_rate": 3.130031838113899e-06, + "loss": 0.81934023, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 2.8475914001464844 + }, + { + "auxiliary_loss_clip": 0.01467653, + "auxiliary_loss_mlp": 0.01286955, + "balance_loss_clip": 1.13102853, + "balance_loss_mlp": 1.04090714, + "epoch": 0.32965579437847586, + "flos": 19173710567520.0, + "grad_norm": 2.5015194116947135, + "language_loss": 0.74184442, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76939046, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 2.7326414585113525 + }, + { + "auxiliary_loss_clip": 0.01472689, + "auxiliary_loss_mlp": 0.01281211, + "balance_loss_clip": 1.1371491, + "balance_loss_mlp": 1.03726089, + "epoch": 0.32971591763114383, + "flos": 30485176591680.0, + "grad_norm": 2.4682812652422705, + "language_loss": 0.75104582, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77858478, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 2.879018545150757 + }, + { + "auxiliary_loss_clip": 0.01466675, + "auxiliary_loss_mlp": 0.01286737, + "balance_loss_clip": 1.12864351, + "balance_loss_mlp": 1.04335976, + "epoch": 0.3297760408838118, + "flos": 16291445051520.0, + "grad_norm": 4.763157706166684, + "language_loss": 0.71638119, + "learning_rate": 3.129067634203742e-06, + "loss": 0.74391532, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.7645299434661865 + }, + { + "auxiliary_loss_clip": 0.01463654, + "auxiliary_loss_mlp": 0.0128384, + "balance_loss_clip": 1.1261791, + "balance_loss_mlp": 1.0374105, + "epoch": 0.32983616413647976, + "flos": 29533359944160.0, + "grad_norm": 1.9511723348003778, + "language_loss": 0.80428886, + "learning_rate": 3.128746147255388e-06, + "loss": 0.83176374, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 2.7947165966033936 + }, + { + "auxiliary_loss_clip": 0.01467878, + "auxiliary_loss_mlp": 0.01283009, + "balance_loss_clip": 1.13029361, + "balance_loss_mlp": 1.03905869, + "epoch": 0.3298962873891478, + "flos": 20633504162400.0, + "grad_norm": 2.3505594081383006, + "language_loss": 0.84401846, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.87152731, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.764371395111084 + }, + { + "auxiliary_loss_clip": 0.01469734, + "auxiliary_loss_mlp": 0.01297886, + "balance_loss_clip": 1.13124895, + "balance_loss_mlp": 1.05031252, + "epoch": 0.32995641064181574, + "flos": 14977790114400.0, + "grad_norm": 5.968361881012228, + "language_loss": 0.74496061, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.77263677, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.733403444290161 + }, + { + "auxiliary_loss_clip": 0.01470414, + "auxiliary_loss_mlp": 0.01291204, + "balance_loss_clip": 1.13155484, + "balance_loss_mlp": 1.046682, + "epoch": 0.3300165338944837, + "flos": 18663457930560.0, + "grad_norm": 2.4641537615597473, + "language_loss": 0.72593093, + "learning_rate": 3.127781429646098e-06, + "loss": 0.75354707, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.844667673110962 + }, + { + "auxiliary_loss_clip": 0.01462933, + "auxiliary_loss_mlp": 0.01290871, + "balance_loss_clip": 1.12491763, + "balance_loss_mlp": 1.05092657, + "epoch": 0.3300766571471517, + "flos": 25585568064000.0, + "grad_norm": 4.4324108964050035, + "language_loss": 0.88469005, + "learning_rate": 3.127459771562238e-06, + "loss": 0.91222811, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 2.8296046257019043 + }, + { + "auxiliary_loss_clip": 0.01454818, + "auxiliary_loss_mlp": 0.0127835, + "balance_loss_clip": 1.11439812, + "balance_loss_mlp": 1.0359261, + "epoch": 0.33013678039981964, + "flos": 11365058878560.0, + "grad_norm": 2.0390698344389055, + "language_loss": 0.83104521, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85837692, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 2.850696563720703 + }, + { + "auxiliary_loss_clip": 0.0146474, + "auxiliary_loss_mlp": 0.01281649, + "balance_loss_clip": 1.12560153, + "balance_loss_mlp": 1.0357914, + "epoch": 0.3301969036524876, + "flos": 24823204397280.0, + "grad_norm": 1.8935853012290813, + "language_loss": 0.77727008, + "learning_rate": 3.126816327146554e-06, + "loss": 0.80473393, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 2.8360934257507324 + }, + { + "auxiliary_loss_clip": 0.01466577, + "auxiliary_loss_mlp": 0.01302614, + "balance_loss_clip": 1.12677991, + "balance_loss_mlp": 1.05751991, + "epoch": 0.33025702690515557, + "flos": 15962983907040.0, + "grad_norm": 3.0110064554185363, + "language_loss": 0.74185979, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76955163, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 2.9046719074249268 + }, + { + "auxiliary_loss_clip": 0.0156935, + "auxiliary_loss_mlp": 0.0123822, + "balance_loss_clip": 1.233109, + "balance_loss_mlp": 1.02383423, + "epoch": 0.33031715015782354, + "flos": 59423134086720.0, + "grad_norm": 0.814375620112159, + "language_loss": 0.53826749, + "learning_rate": 3.12617271181492e-06, + "loss": 0.56634319, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.3426403999328613 + }, + { + "auxiliary_loss_clip": 0.01468986, + "auxiliary_loss_mlp": 0.01292186, + "balance_loss_clip": 1.13141954, + "balance_loss_mlp": 1.0493803, + "epoch": 0.3303772734104915, + "flos": 23186549970720.0, + "grad_norm": 1.6248552692889746, + "language_loss": 0.86962664, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89723837, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 2.7778894901275635 + }, + { + "auxiliary_loss_clip": 0.01468383, + "auxiliary_loss_mlp": 0.01286748, + "balance_loss_clip": 1.12801206, + "balance_loss_mlp": 1.04031861, + "epoch": 0.33043739666315947, + "flos": 33074633797920.0, + "grad_norm": 3.408295422054425, + "language_loss": 0.73190922, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75946057, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.910443067550659 + }, + { + "auxiliary_loss_clip": 0.01464804, + "auxiliary_loss_mlp": 0.01291957, + "balance_loss_clip": 1.12447238, + "balance_loss_mlp": 1.04972422, + "epoch": 0.33049751991582743, + "flos": 24897506391360.0, + "grad_norm": 2.3435264163774367, + "language_loss": 0.72157413, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74914175, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 4.493640422821045 + }, + { + "auxiliary_loss_clip": 0.01469977, + "auxiliary_loss_mlp": 0.01283801, + "balance_loss_clip": 1.13068461, + "balance_loss_mlp": 1.03641784, + "epoch": 0.3305576431684954, + "flos": 29463154191360.0, + "grad_norm": 7.258996105490717, + "language_loss": 0.80765349, + "learning_rate": 3.124884968794321e-06, + "loss": 0.83519125, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.8582987785339355 + }, + { + "auxiliary_loss_clip": 0.01461829, + "auxiliary_loss_mlp": 0.0128523, + "balance_loss_clip": 1.12165141, + "balance_loss_mlp": 1.04414105, + "epoch": 0.33061776642116336, + "flos": 22634196711840.0, + "grad_norm": 2.066713639721362, + "language_loss": 0.76169384, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78916442, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 2.824479103088379 + }, + { + "auxiliary_loss_clip": 0.01462207, + "auxiliary_loss_mlp": 0.01291967, + "balance_loss_clip": 1.12351227, + "balance_loss_mlp": 1.04839861, + "epoch": 0.3306778896738313, + "flos": 25778738004480.0, + "grad_norm": 1.5103403824246513, + "language_loss": 0.79036641, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81790817, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.823617935180664 + }, + { + "auxiliary_loss_clip": 0.01471804, + "auxiliary_loss_mlp": 0.01296345, + "balance_loss_clip": 1.1321342, + "balance_loss_mlp": 1.05353975, + "epoch": 0.33073801292649935, + "flos": 36943572304800.0, + "grad_norm": 3.8781602216501696, + "language_loss": 0.66886473, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.69654626, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 2.9430136680603027 + }, + { + "auxiliary_loss_clip": 0.01473867, + "auxiliary_loss_mlp": 0.01295605, + "balance_loss_clip": 1.13396096, + "balance_loss_mlp": 1.05031967, + "epoch": 0.3307981361791673, + "flos": 12969322292160.0, + "grad_norm": 2.3706114751087437, + "language_loss": 0.77437675, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.80207151, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.722346067428589 + }, + { + "auxiliary_loss_clip": 0.01477227, + "auxiliary_loss_mlp": 0.0129571, + "balance_loss_clip": 1.13727975, + "balance_loss_mlp": 1.04813576, + "epoch": 0.3308582594318353, + "flos": 25376733429120.0, + "grad_norm": 1.6951049278234833, + "language_loss": 0.72275633, + "learning_rate": 3.123274330355824e-06, + "loss": 0.75048566, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.818004846572876 + }, + { + "auxiliary_loss_clip": 0.01466793, + "auxiliary_loss_mlp": 0.0129604, + "balance_loss_clip": 1.12551403, + "balance_loss_mlp": 1.05418825, + "epoch": 0.33091838268450324, + "flos": 26470971774720.0, + "grad_norm": 1.5931214018494841, + "language_loss": 0.74929965, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77692795, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.85037899017334 + }, + { + "auxiliary_loss_clip": 0.01468233, + "auxiliary_loss_mlp": 0.01292001, + "balance_loss_clip": 1.12884068, + "balance_loss_mlp": 1.04614413, + "epoch": 0.3309785059371712, + "flos": 24973401368160.0, + "grad_norm": 1.652588185149765, + "language_loss": 0.70187086, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72947323, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 2.800341844558716 + }, + { + "auxiliary_loss_clip": 0.01474234, + "auxiliary_loss_mlp": 0.0128749, + "balance_loss_clip": 1.13457429, + "balance_loss_mlp": 1.04182315, + "epoch": 0.3310386291898392, + "flos": 20448678417120.0, + "grad_norm": 2.0207596666595857, + "language_loss": 0.82186735, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84948456, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.8114516735076904 + }, + { + "auxiliary_loss_clip": 0.014815, + "auxiliary_loss_mlp": 0.01283467, + "balance_loss_clip": 1.14066708, + "balance_loss_mlp": 1.03875387, + "epoch": 0.33109875244250714, + "flos": 23184767347200.0, + "grad_norm": 2.215633785100744, + "language_loss": 0.79473317, + "learning_rate": 3.121985052827606e-06, + "loss": 0.82238287, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 4.388697624206543 + }, + { + "auxiliary_loss_clip": 0.01469897, + "auxiliary_loss_mlp": 0.01284767, + "balance_loss_clip": 1.12954652, + "balance_loss_mlp": 1.03471375, + "epoch": 0.3311588756951751, + "flos": 24170567990400.0, + "grad_norm": 2.1510450869364934, + "language_loss": 0.71724117, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.74478787, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 4.456750154495239 + }, + { + "auxiliary_loss_clip": 0.01471725, + "auxiliary_loss_mlp": 0.01280402, + "balance_loss_clip": 1.13215625, + "balance_loss_mlp": 1.03416371, + "epoch": 0.33121899894784307, + "flos": 28148171768640.0, + "grad_norm": 3.022136490724067, + "language_loss": 0.71814442, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74566567, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 4.362739324569702 + }, + { + "auxiliary_loss_clip": 0.01474408, + "auxiliary_loss_mlp": 0.01285141, + "balance_loss_clip": 1.13432586, + "balance_loss_mlp": 1.03775835, + "epoch": 0.33127912220051103, + "flos": 29570112695520.0, + "grad_norm": 2.5061242626096765, + "language_loss": 0.73260725, + "learning_rate": 3.121017647907921e-06, + "loss": 0.76020277, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 2.865279197692871 + }, + { + "auxiliary_loss_clip": 0.01471796, + "auxiliary_loss_mlp": 0.01282986, + "balance_loss_clip": 1.13124156, + "balance_loss_mlp": 1.03579402, + "epoch": 0.331339245453179, + "flos": 14430595085280.0, + "grad_norm": 3.2305017353270435, + "language_loss": 0.88115644, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90870422, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 2.774930238723755 + }, + { + "auxiliary_loss_clip": 0.0146711, + "auxiliary_loss_mlp": 0.01274434, + "balance_loss_clip": 1.12906158, + "balance_loss_mlp": 1.02895772, + "epoch": 0.33139936870584696, + "flos": 20889256295520.0, + "grad_norm": 1.7618718374629978, + "language_loss": 0.73372787, + "learning_rate": 3.12037249872891e-06, + "loss": 0.76114333, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.777351140975952 + }, + { + "auxiliary_loss_clip": 0.01473124, + "auxiliary_loss_mlp": 0.01290458, + "balance_loss_clip": 1.1332078, + "balance_loss_mlp": 1.04326594, + "epoch": 0.33145949195851493, + "flos": 36287370650880.0, + "grad_norm": 2.445000485129548, + "language_loss": 0.72362292, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.75125879, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.929598093032837 + }, + { + "auxiliary_loss_clip": 0.01469986, + "auxiliary_loss_mlp": 0.0127782, + "balance_loss_clip": 1.12930036, + "balance_loss_mlp": 1.0279572, + "epoch": 0.33151961521118295, + "flos": 14281308390240.0, + "grad_norm": 1.9618188319621592, + "language_loss": 0.68090868, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70838672, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 2.7191128730773926 + }, + { + "auxiliary_loss_clip": 0.01477265, + "auxiliary_loss_mlp": 0.01292143, + "balance_loss_clip": 1.13667834, + "balance_loss_mlp": 1.04437828, + "epoch": 0.3315797384638509, + "flos": 20776267213920.0, + "grad_norm": 2.3848837608210673, + "language_loss": 0.66819221, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.69588625, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 2.7797563076019287 + }, + { + "auxiliary_loss_clip": 0.01477998, + "auxiliary_loss_mlp": 0.01286018, + "balance_loss_clip": 1.13932657, + "balance_loss_mlp": 1.0361551, + "epoch": 0.3316398617165189, + "flos": 24681692975040.0, + "grad_norm": 2.9789109665305276, + "language_loss": 0.69045442, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71809459, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.7867019176483154 + }, + { + "auxiliary_loss_clip": 0.01472753, + "auxiliary_loss_mlp": 0.01278122, + "balance_loss_clip": 1.13437557, + "balance_loss_mlp": 1.03073919, + "epoch": 0.33169998496918685, + "flos": 18589952427840.0, + "grad_norm": 2.304524066187729, + "language_loss": 0.7999903, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82749903, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.7822322845458984 + }, + { + "auxiliary_loss_clip": 0.01469079, + "auxiliary_loss_mlp": 0.01271571, + "balance_loss_clip": 1.13027787, + "balance_loss_mlp": 1.02399755, + "epoch": 0.3317601082218548, + "flos": 20195732967840.0, + "grad_norm": 2.1031446525431945, + "language_loss": 0.74492478, + "learning_rate": 3.118436031952143e-06, + "loss": 0.7723313, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 2.757127046585083 + }, + { + "auxiliary_loss_clip": 0.01556825, + "auxiliary_loss_mlp": 0.01300896, + "balance_loss_clip": 1.22190499, + "balance_loss_mlp": 1.08879852, + "epoch": 0.3318202314745228, + "flos": 68981543068320.0, + "grad_norm": 0.6457641054985283, + "language_loss": 0.54271722, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.57129443, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.426854133605957 + }, + { + "auxiliary_loss_clip": 0.01475481, + "auxiliary_loss_mlp": 0.01281345, + "balance_loss_clip": 1.13603199, + "balance_loss_mlp": 1.03167272, + "epoch": 0.33188035472719074, + "flos": 21501878129280.0, + "grad_norm": 2.188978945512068, + "language_loss": 0.78382671, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81139493, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.7638165950775146 + }, + { + "auxiliary_loss_clip": 0.01474152, + "auxiliary_loss_mlp": 0.0127749, + "balance_loss_clip": 1.13521576, + "balance_loss_mlp": 1.02934384, + "epoch": 0.3319404779798587, + "flos": 28872682767360.0, + "grad_norm": 1.847940757830241, + "language_loss": 0.76391238, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.7914288, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 2.8994553089141846 + }, + { + "auxiliary_loss_clip": 0.01470277, + "auxiliary_loss_mlp": 0.01277951, + "balance_loss_clip": 1.13133287, + "balance_loss_mlp": 1.02904212, + "epoch": 0.33200060123252667, + "flos": 23079212184960.0, + "grad_norm": 2.292333447830324, + "language_loss": 0.70234722, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72982949, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 2.8944294452667236 + }, + { + "auxiliary_loss_clip": 0.01467601, + "auxiliary_loss_mlp": 0.01271994, + "balance_loss_clip": 1.12902665, + "balance_loss_mlp": 1.02785337, + "epoch": 0.33206072448519464, + "flos": 21144629147040.0, + "grad_norm": 1.6741557416074706, + "language_loss": 0.73739547, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76479149, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 2.792487621307373 + }, + { + "auxiliary_loss_clip": 0.01473935, + "auxiliary_loss_mlp": 0.01271631, + "balance_loss_clip": 1.13459635, + "balance_loss_mlp": 1.02596474, + "epoch": 0.3321208477378626, + "flos": 13080794247360.0, + "grad_norm": 2.368466468902138, + "language_loss": 0.81737614, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84483182, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.941277503967285 + }, + { + "auxiliary_loss_clip": 0.01475423, + "auxiliary_loss_mlp": 0.01272386, + "balance_loss_clip": 1.13675129, + "balance_loss_mlp": 1.02576637, + "epoch": 0.33218097099053057, + "flos": 21217565727360.0, + "grad_norm": 1.8848774839601545, + "language_loss": 0.83195621, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85943425, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 2.905686616897583 + }, + { + "auxiliary_loss_clip": 0.01557767, + "auxiliary_loss_mlp": 0.01237663, + "balance_loss_clip": 1.22677493, + "balance_loss_mlp": 1.01641083, + "epoch": 0.33224109424319853, + "flos": 64356005975040.0, + "grad_norm": 0.7918967809258742, + "language_loss": 0.52512717, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.55308151, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.3170857429504395 + }, + { + "auxiliary_loss_clip": 0.01484737, + "auxiliary_loss_mlp": 0.01276767, + "balance_loss_clip": 1.14568007, + "balance_loss_mlp": 1.02747679, + "epoch": 0.33230121749586655, + "flos": 17347830729120.0, + "grad_norm": 3.799246108282623, + "language_loss": 0.7772826, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.80489761, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 2.819183588027954 + }, + { + "auxiliary_loss_clip": 0.01476606, + "auxiliary_loss_mlp": 0.0127905, + "balance_loss_clip": 1.13670921, + "balance_loss_mlp": 1.03185809, + "epoch": 0.3323613407485345, + "flos": 20999211124320.0, + "grad_norm": 1.9259490148359157, + "language_loss": 0.71862316, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.7461797, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.89255690574646 + }, + { + "auxiliary_loss_clip": 0.014784, + "auxiliary_loss_mlp": 0.01273193, + "balance_loss_clip": 1.13881803, + "balance_loss_mlp": 1.02561951, + "epoch": 0.3324214640012025, + "flos": 13154261821920.0, + "grad_norm": 2.0985201819702213, + "language_loss": 0.83017647, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.85769242, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 2.808138132095337 + }, + { + "auxiliary_loss_clip": 0.0148588, + "auxiliary_loss_mlp": 0.01285179, + "balance_loss_clip": 1.14656019, + "balance_loss_mlp": 1.03684199, + "epoch": 0.33248158725387045, + "flos": 22275734028480.0, + "grad_norm": 1.9353327435146948, + "language_loss": 0.7019881, + "learning_rate": 3.114558520634423e-06, + "loss": 0.72969872, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.804316759109497 + }, + { + "auxiliary_loss_clip": 0.01481698, + "auxiliary_loss_mlp": 0.01280865, + "balance_loss_clip": 1.14305115, + "balance_loss_mlp": 1.03157496, + "epoch": 0.3325417105065384, + "flos": 20743117637760.0, + "grad_norm": 3.802718555770102, + "language_loss": 0.76261061, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.79023623, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 2.805267095565796 + }, + { + "auxiliary_loss_clip": 0.01488478, + "auxiliary_loss_mlp": 0.01304522, + "balance_loss_clip": 1.15059376, + "balance_loss_mlp": 1.05923676, + "epoch": 0.3326018337592064, + "flos": 24792861504960.0, + "grad_norm": 1.8326254533882314, + "language_loss": 0.73211175, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.76004171, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 2.838864326477051 + }, + { + "auxiliary_loss_clip": 0.0148611, + "auxiliary_loss_mlp": 0.01277371, + "balance_loss_clip": 1.14906442, + "balance_loss_mlp": 1.03189516, + "epoch": 0.33266195701187434, + "flos": 14503455809280.0, + "grad_norm": 2.4735724714425804, + "language_loss": 0.65855759, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.68619239, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 2.8234639167785645 + }, + { + "auxiliary_loss_clip": 0.01479615, + "auxiliary_loss_mlp": 0.01287292, + "balance_loss_clip": 1.14357209, + "balance_loss_mlp": 1.04524899, + "epoch": 0.3327220802645423, + "flos": 15306213330720.0, + "grad_norm": 2.6295290292802243, + "language_loss": 0.70961773, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73728681, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 2.775808811187744 + }, + { + "auxiliary_loss_clip": 0.01484664, + "auxiliary_loss_mlp": 0.01292726, + "balance_loss_clip": 1.14599442, + "balance_loss_mlp": 1.05106461, + "epoch": 0.3327822035172103, + "flos": 23479813418400.0, + "grad_norm": 2.125906756000196, + "language_loss": 0.67122811, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69900197, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.831641674041748 + }, + { + "auxiliary_loss_clip": 0.01478876, + "auxiliary_loss_mlp": 0.01282564, + "balance_loss_clip": 1.14241648, + "balance_loss_mlp": 1.03785133, + "epoch": 0.33284232676987824, + "flos": 25376885141760.0, + "grad_norm": 3.804013979892388, + "language_loss": 0.73147976, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75909412, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 4.537033319473267 + }, + { + "auxiliary_loss_clip": 0.01483426, + "auxiliary_loss_mlp": 0.01289348, + "balance_loss_clip": 1.14612806, + "balance_loss_mlp": 1.04787755, + "epoch": 0.3329024500225462, + "flos": 23696537110560.0, + "grad_norm": 1.7559815795145297, + "language_loss": 0.81573772, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84346545, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.8433775901794434 + }, + { + "auxiliary_loss_clip": 0.01482248, + "auxiliary_loss_mlp": 0.01305463, + "balance_loss_clip": 1.14585555, + "balance_loss_mlp": 1.06189501, + "epoch": 0.33296257327521417, + "flos": 31725894948480.0, + "grad_norm": 2.1416306073619076, + "language_loss": 0.71337444, + "learning_rate": 3.111970130648789e-06, + "loss": 0.74125153, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 2.8783469200134277 + }, + { + "auxiliary_loss_clip": 0.01482551, + "auxiliary_loss_mlp": 0.01296796, + "balance_loss_clip": 1.14652777, + "balance_loss_mlp": 1.05551648, + "epoch": 0.33302269652788213, + "flos": 22746768583680.0, + "grad_norm": 2.3870348460238433, + "language_loss": 0.74316096, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.77095449, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.7846927642822266 + }, + { + "auxiliary_loss_clip": 0.01485803, + "auxiliary_loss_mlp": 0.01310201, + "balance_loss_clip": 1.14926851, + "balance_loss_mlp": 1.06510639, + "epoch": 0.33308281978055015, + "flos": 11475355060800.0, + "grad_norm": 1.905418828450765, + "language_loss": 0.71027726, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73823738, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 2.754042148590088 + }, + { + "auxiliary_loss_clip": 0.01481594, + "auxiliary_loss_mlp": 0.01294547, + "balance_loss_clip": 1.14417231, + "balance_loss_mlp": 1.05269551, + "epoch": 0.3331429430332181, + "flos": 38216567890080.0, + "grad_norm": 1.612861350131954, + "language_loss": 0.60571992, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.63348126, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 3.013855218887329 + }, + { + "auxiliary_loss_clip": 0.01478642, + "auxiliary_loss_mlp": 0.01292783, + "balance_loss_clip": 1.14243841, + "balance_loss_mlp": 1.04807019, + "epoch": 0.3332030662858861, + "flos": 22530879311040.0, + "grad_norm": 1.8899748012880462, + "language_loss": 0.69029254, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.71800679, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.7845258712768555 + }, + { + "auxiliary_loss_clip": 0.01481919, + "auxiliary_loss_mlp": 0.01296813, + "balance_loss_clip": 1.14569664, + "balance_loss_mlp": 1.05362582, + "epoch": 0.33326318953855405, + "flos": 15999622873920.0, + "grad_norm": 1.9337688809170617, + "language_loss": 0.751827, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77961433, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.7306840419769287 + }, + { + "auxiliary_loss_clip": 0.01485396, + "auxiliary_loss_mlp": 0.01291879, + "balance_loss_clip": 1.1483022, + "balance_loss_mlp": 1.04678428, + "epoch": 0.333323312791222, + "flos": 25595922451680.0, + "grad_norm": 1.6923858963351819, + "language_loss": 0.74827212, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77604485, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 2.8000590801239014 + }, + { + "auxiliary_loss_clip": 0.01480138, + "auxiliary_loss_mlp": 0.01277647, + "balance_loss_clip": 1.14337647, + "balance_loss_mlp": 1.03198051, + "epoch": 0.33338343604389, + "flos": 25121929500000.0, + "grad_norm": 2.6193052581097747, + "language_loss": 0.7080791, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73565698, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 4.445868492126465 + }, + { + "auxiliary_loss_clip": 0.01483784, + "auxiliary_loss_mlp": 0.01286266, + "balance_loss_clip": 1.14620805, + "balance_loss_mlp": 1.04059947, + "epoch": 0.33344355929655795, + "flos": 16949239688160.0, + "grad_norm": 1.7382044618898573, + "language_loss": 0.68912184, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71682239, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 4.2696263790130615 + }, + { + "auxiliary_loss_clip": 0.01469484, + "auxiliary_loss_mlp": 0.01287348, + "balance_loss_clip": 1.13350272, + "balance_loss_mlp": 1.04034615, + "epoch": 0.3335036825492259, + "flos": 27891964497600.0, + "grad_norm": 2.1729429222245695, + "language_loss": 0.64919585, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.67676425, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 4.317318677902222 + }, + { + "auxiliary_loss_clip": 0.01485949, + "auxiliary_loss_mlp": 0.01280832, + "balance_loss_clip": 1.14806688, + "balance_loss_mlp": 1.03573763, + "epoch": 0.3335638058018939, + "flos": 16181376438240.0, + "grad_norm": 2.387265684097939, + "language_loss": 0.85270095, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.88036877, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.7837743759155273 + }, + { + "auxiliary_loss_clip": 0.01474443, + "auxiliary_loss_mlp": 0.01277104, + "balance_loss_clip": 1.1358037, + "balance_loss_mlp": 1.02819479, + "epoch": 0.33362392905456184, + "flos": 39899305395360.0, + "grad_norm": 2.095349046457567, + "language_loss": 0.74402344, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.77153891, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 2.9446802139282227 + }, + { + "auxiliary_loss_clip": 0.01478322, + "auxiliary_loss_mlp": 0.01272584, + "balance_loss_clip": 1.14086699, + "balance_loss_mlp": 1.02062345, + "epoch": 0.3336840523072298, + "flos": 44274476154240.0, + "grad_norm": 1.8923645109030383, + "language_loss": 0.68567669, + "learning_rate": 3.108082487713921e-06, + "loss": 0.71318573, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.9663045406341553 + }, + { + "auxiliary_loss_clip": 0.0148147, + "auxiliary_loss_mlp": 0.01275984, + "balance_loss_clip": 1.14264321, + "balance_loss_mlp": 1.03108072, + "epoch": 0.33374417555989777, + "flos": 15087479446080.0, + "grad_norm": 2.3331609123682844, + "language_loss": 0.59930873, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62688327, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.792949914932251 + }, + { + "auxiliary_loss_clip": 0.01480461, + "auxiliary_loss_mlp": 0.01281585, + "balance_loss_clip": 1.14256692, + "balance_loss_mlp": 1.02886164, + "epoch": 0.33380429881256574, + "flos": 15850867173120.0, + "grad_norm": 2.9640114570155314, + "language_loss": 0.70407832, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.73169881, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.8304920196533203 + }, + { + "auxiliary_loss_clip": 0.01473101, + "auxiliary_loss_mlp": 0.01285976, + "balance_loss_clip": 1.13586199, + "balance_loss_mlp": 1.03802109, + "epoch": 0.33386442206523376, + "flos": 13482040259520.0, + "grad_norm": 2.405898153818571, + "language_loss": 0.83582681, + "learning_rate": 3.107109630732192e-06, + "loss": 0.86341763, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.846839189529419 + }, + { + "auxiliary_loss_clip": 0.01478397, + "auxiliary_loss_mlp": 0.01277851, + "balance_loss_clip": 1.14111638, + "balance_loss_mlp": 1.02951431, + "epoch": 0.3339245453179017, + "flos": 16692539351040.0, + "grad_norm": 2.6670745684177932, + "language_loss": 0.81400049, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.84156293, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.7252728939056396 + }, + { + "auxiliary_loss_clip": 0.01479808, + "auxiliary_loss_mlp": 0.01271664, + "balance_loss_clip": 1.14244056, + "balance_loss_mlp": 1.02370882, + "epoch": 0.3339846685705697, + "flos": 24613080204960.0, + "grad_norm": 1.4972029998213165, + "language_loss": 0.81457114, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.84208584, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 2.884279251098633 + }, + { + "auxiliary_loss_clip": 0.01481813, + "auxiliary_loss_mlp": 0.01278316, + "balance_loss_clip": 1.14375067, + "balance_loss_mlp": 1.03207707, + "epoch": 0.33404479182323765, + "flos": 30956324931360.0, + "grad_norm": 2.2064916978435267, + "language_loss": 0.74658155, + "learning_rate": 3.106136395915099e-06, + "loss": 0.7741828, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 2.863201379776001 + }, + { + "auxiliary_loss_clip": 0.01477537, + "auxiliary_loss_mlp": 0.01270517, + "balance_loss_clip": 1.13957834, + "balance_loss_mlp": 1.02733004, + "epoch": 0.3341049150759056, + "flos": 23515542109440.0, + "grad_norm": 1.4862359092824793, + "language_loss": 0.82283092, + "learning_rate": 3.105811900403391e-06, + "loss": 0.85031152, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.785926580429077 + }, + { + "auxiliary_loss_clip": 0.01486821, + "auxiliary_loss_mlp": 0.01296182, + "balance_loss_clip": 1.14878464, + "balance_loss_mlp": 1.04975247, + "epoch": 0.3341650383285736, + "flos": 24029246208960.0, + "grad_norm": 1.8149839753438217, + "language_loss": 0.80437779, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.83220786, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 2.9015309810638428 + }, + { + "auxiliary_loss_clip": 0.01478499, + "auxiliary_loss_mlp": 0.01277845, + "balance_loss_clip": 1.14072287, + "balance_loss_mlp": 1.03160632, + "epoch": 0.33422516158124155, + "flos": 24904409316480.0, + "grad_norm": 1.6929729936128162, + "language_loss": 0.81430417, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84186757, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 2.8112974166870117 + }, + { + "auxiliary_loss_clip": 0.01477646, + "auxiliary_loss_mlp": 0.01289628, + "balance_loss_clip": 1.14146733, + "balance_loss_mlp": 1.0464406, + "epoch": 0.3342852848339095, + "flos": 18335413995840.0, + "grad_norm": 1.935627449938831, + "language_loss": 0.72220868, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74988145, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 2.8033101558685303 + }, + { + "auxiliary_loss_clip": 0.01475525, + "auxiliary_loss_mlp": 0.01292243, + "balance_loss_clip": 1.14030313, + "balance_loss_mlp": 1.04543185, + "epoch": 0.3343454080865775, + "flos": 30049263876960.0, + "grad_norm": 1.5378443966156408, + "language_loss": 0.75154173, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77921945, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.863959550857544 + }, + { + "auxiliary_loss_clip": 0.01481834, + "auxiliary_loss_mlp": 0.01288599, + "balance_loss_clip": 1.14542758, + "balance_loss_mlp": 1.0417881, + "epoch": 0.33440553133924544, + "flos": 16400679245280.0, + "grad_norm": 1.9318958167258138, + "language_loss": 0.69449401, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.72219831, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 2.769331455230713 + }, + { + "auxiliary_loss_clip": 0.01475961, + "auxiliary_loss_mlp": 0.0128006, + "balance_loss_clip": 1.14127946, + "balance_loss_mlp": 1.03343987, + "epoch": 0.3344656545919134, + "flos": 24244301062080.0, + "grad_norm": 4.812023693056992, + "language_loss": 0.65044314, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67800337, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 2.7681727409362793 + }, + { + "auxiliary_loss_clip": 0.01470921, + "auxiliary_loss_mlp": 0.01291939, + "balance_loss_clip": 1.1351651, + "balance_loss_mlp": 1.04627275, + "epoch": 0.3345257778445814, + "flos": 52122725206560.0, + "grad_norm": 1.5431800010281387, + "language_loss": 0.74247694, + "learning_rate": 3.103539258400766e-06, + "loss": 0.7701056, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 3.041846513748169 + }, + { + "auxiliary_loss_clip": 0.01585805, + "auxiliary_loss_mlp": 0.01261803, + "balance_loss_clip": 1.25875866, + "balance_loss_mlp": 1.05046844, + "epoch": 0.33458590109724934, + "flos": 68054607658080.0, + "grad_norm": 0.7837561873831289, + "language_loss": 0.55419385, + "learning_rate": 3.103214427773745e-06, + "loss": 0.58266997, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 3.280453681945801 + }, + { + "auxiliary_loss_clip": 0.01476407, + "auxiliary_loss_mlp": 0.01277701, + "balance_loss_clip": 1.14114499, + "balance_loss_mlp": 1.03413236, + "epoch": 0.3346460243499173, + "flos": 37417830753600.0, + "grad_norm": 2.6415445963217197, + "language_loss": 0.64965975, + "learning_rate": 3.102889555312721e-06, + "loss": 0.67720085, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 2.932225465774536 + }, + { + "auxiliary_loss_clip": 0.0146911, + "auxiliary_loss_mlp": 0.01289761, + "balance_loss_clip": 1.13443482, + "balance_loss_mlp": 1.04657364, + "epoch": 0.3347061476025853, + "flos": 18699338334240.0, + "grad_norm": 2.2760590455356753, + "language_loss": 0.7803244, + "learning_rate": 3.102564641030016e-06, + "loss": 0.80791312, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.782686710357666 + }, + { + "auxiliary_loss_clip": 0.01466554, + "auxiliary_loss_mlp": 0.01293238, + "balance_loss_clip": 1.13193011, + "balance_loss_mlp": 1.0489068, + "epoch": 0.3347662708552533, + "flos": 13919545956960.0, + "grad_norm": 1.9335653661669225, + "language_loss": 0.76312679, + "learning_rate": 3.102239684937949e-06, + "loss": 0.79072469, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 2.7379939556121826 + }, + { + "auxiliary_loss_clip": 0.01463738, + "auxiliary_loss_mlp": 0.01281612, + "balance_loss_clip": 1.12819433, + "balance_loss_mlp": 1.03461039, + "epoch": 0.33482639410792125, + "flos": 19752158764800.0, + "grad_norm": 5.800941828992449, + "language_loss": 0.71231896, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73977244, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 2.8055920600891113 + }, + { + "auxiliary_loss_clip": 0.01453913, + "auxiliary_loss_mlp": 0.01273665, + "balance_loss_clip": 1.12042868, + "balance_loss_mlp": 1.02570975, + "epoch": 0.3348865173605892, + "flos": 16104609113760.0, + "grad_norm": 2.855253241295563, + "language_loss": 0.89834714, + "learning_rate": 3.10158964737502e-06, + "loss": 0.92562294, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 2.6927149295806885 + }, + { + "auxiliary_loss_clip": 0.0146418, + "auxiliary_loss_mlp": 0.01278164, + "balance_loss_clip": 1.13141775, + "balance_loss_mlp": 1.03020859, + "epoch": 0.3349466406132572, + "flos": 25011254036160.0, + "grad_norm": 1.7705998396237361, + "language_loss": 0.80013758, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82756102, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 2.834042549133301 + }, + { + "auxiliary_loss_clip": 0.01657188, + "auxiliary_loss_mlp": 0.01227371, + "balance_loss_clip": 1.32580781, + "balance_loss_mlp": 1.00764465, + "epoch": 0.33500676386592515, + "flos": 54326714150880.0, + "grad_norm": 0.89587703789259, + "language_loss": 0.5600282, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.58887386, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 3.3275156021118164 + }, + { + "auxiliary_loss_clip": 0.01469916, + "auxiliary_loss_mlp": 0.01283586, + "balance_loss_clip": 1.13758802, + "balance_loss_mlp": 1.03830147, + "epoch": 0.3350668871185931, + "flos": 26799774272640.0, + "grad_norm": 2.096830703349717, + "language_loss": 0.78933656, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.81687152, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 4.404458045959473 + }, + { + "auxiliary_loss_clip": 0.01463046, + "auxiliary_loss_mlp": 0.01283015, + "balance_loss_clip": 1.13003767, + "balance_loss_mlp": 1.04001927, + "epoch": 0.3351270103712611, + "flos": 33513315268320.0, + "grad_norm": 2.280945558688729, + "language_loss": 0.72720027, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.75466084, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 2.8226563930511475 + }, + { + "auxiliary_loss_clip": 0.01465014, + "auxiliary_loss_mlp": 0.01289662, + "balance_loss_clip": 1.13148189, + "balance_loss_mlp": 1.04666615, + "epoch": 0.33518713362392905, + "flos": 26508710658240.0, + "grad_norm": 1.6771668912477342, + "language_loss": 0.88166064, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.9092074, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.808206796646118 + }, + { + "auxiliary_loss_clip": 0.01476603, + "auxiliary_loss_mlp": 0.01290409, + "balance_loss_clip": 1.14350319, + "balance_loss_mlp": 1.04092729, + "epoch": 0.335247256876597, + "flos": 17233931371680.0, + "grad_norm": 3.044552776702612, + "language_loss": 0.83032, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.85799015, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 2.7292637825012207 + }, + { + "auxiliary_loss_clip": 0.01467333, + "auxiliary_loss_mlp": 0.01284007, + "balance_loss_clip": 1.13428354, + "balance_loss_mlp": 1.03528857, + "epoch": 0.335307380129265, + "flos": 25631726999040.0, + "grad_norm": 5.259204931655385, + "language_loss": 0.73086512, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75837851, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 2.9210317134857178 + }, + { + "auxiliary_loss_clip": 0.01479956, + "auxiliary_loss_mlp": 0.01294071, + "balance_loss_clip": 1.14626074, + "balance_loss_mlp": 1.04535294, + "epoch": 0.33536750338193294, + "flos": 19681990940160.0, + "grad_norm": 1.6192408026085214, + "language_loss": 0.81740916, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.84514952, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.791811943054199 + }, + { + "auxiliary_loss_clip": 0.0148193, + "auxiliary_loss_mlp": 0.0127767, + "balance_loss_clip": 1.15046155, + "balance_loss_mlp": 1.03333831, + "epoch": 0.3354276266346009, + "flos": 18334579576320.0, + "grad_norm": 1.8064420514649815, + "language_loss": 0.71784568, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.74544156, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.82057785987854 + }, + { + "auxiliary_loss_clip": 0.01470367, + "auxiliary_loss_mlp": 0.01290459, + "balance_loss_clip": 1.13939905, + "balance_loss_mlp": 1.0398339, + "epoch": 0.3354877498872689, + "flos": 17860510768320.0, + "grad_norm": 1.8174416415395995, + "language_loss": 0.81499422, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.84260249, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 2.7667789459228516 + }, + { + "auxiliary_loss_clip": 0.01469784, + "auxiliary_loss_mlp": 0.0128229, + "balance_loss_clip": 1.13786066, + "balance_loss_mlp": 1.03338099, + "epoch": 0.3355478731399369, + "flos": 24720076637280.0, + "grad_norm": 2.159552312511624, + "language_loss": 0.77949435, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80701506, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.786968231201172 + }, + { + "auxiliary_loss_clip": 0.01470115, + "auxiliary_loss_mlp": 0.01303808, + "balance_loss_clip": 1.1380434, + "balance_loss_mlp": 1.05470872, + "epoch": 0.33560799639260486, + "flos": 16875810041760.0, + "grad_norm": 2.5578283420916463, + "language_loss": 0.74541199, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.77315128, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.746875047683716 + }, + { + "auxiliary_loss_clip": 0.01468248, + "auxiliary_loss_mlp": 0.01289647, + "balance_loss_clip": 1.13746762, + "balance_loss_mlp": 1.04207349, + "epoch": 0.3356681196452728, + "flos": 18335527780320.0, + "grad_norm": 2.0739426838972626, + "language_loss": 0.82245624, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.85003519, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 4.425646781921387 + }, + { + "auxiliary_loss_clip": 0.014739, + "auxiliary_loss_mlp": 0.0128345, + "balance_loss_clip": 1.14338231, + "balance_loss_mlp": 1.03816462, + "epoch": 0.3357282428979408, + "flos": 34753464702720.0, + "grad_norm": 1.820955184634493, + "language_loss": 0.77739906, + "learning_rate": 3.097034711451581e-06, + "loss": 0.80497265, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.893049955368042 + }, + { + "auxiliary_loss_clip": 0.01466472, + "auxiliary_loss_mlp": 0.01288105, + "balance_loss_clip": 1.13447428, + "balance_loss_mlp": 1.04072237, + "epoch": 0.33578836615060875, + "flos": 21582248628960.0, + "grad_norm": 1.9860378202462778, + "language_loss": 0.76161361, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78915942, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 5.879533052444458 + }, + { + "auxiliary_loss_clip": 0.0147387, + "auxiliary_loss_mlp": 0.01293719, + "balance_loss_clip": 1.14330745, + "balance_loss_mlp": 1.05243945, + "epoch": 0.3358484894032767, + "flos": 24532330423680.0, + "grad_norm": 1.9673781058147044, + "language_loss": 0.77715582, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.80483174, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.793911933898926 + }, + { + "auxiliary_loss_clip": 0.01476558, + "auxiliary_loss_mlp": 0.01291478, + "balance_loss_clip": 1.14351702, + "balance_loss_mlp": 1.04123425, + "epoch": 0.3359086126559447, + "flos": 22457791018080.0, + "grad_norm": 1.8620298548985077, + "language_loss": 0.81072658, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83840704, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.9088337421417236 + }, + { + "auxiliary_loss_clip": 0.01478905, + "auxiliary_loss_mlp": 0.01291583, + "balance_loss_clip": 1.14631152, + "balance_loss_mlp": 1.04896772, + "epoch": 0.33596873590861265, + "flos": 16545490417440.0, + "grad_norm": 1.8732602966162666, + "language_loss": 0.67211574, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69982058, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.7793638706207275 + }, + { + "auxiliary_loss_clip": 0.01470457, + "auxiliary_loss_mlp": 0.01284745, + "balance_loss_clip": 1.13816905, + "balance_loss_mlp": 1.03736234, + "epoch": 0.3360288591612806, + "flos": 31178510278560.0, + "grad_norm": 2.157672967373414, + "language_loss": 0.70161349, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72916555, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 2.8756933212280273 + }, + { + "auxiliary_loss_clip": 0.01471513, + "auxiliary_loss_mlp": 0.01280921, + "balance_loss_clip": 1.14051545, + "balance_loss_mlp": 1.0339191, + "epoch": 0.3360889824139486, + "flos": 23699230009920.0, + "grad_norm": 2.0574121027276884, + "language_loss": 0.67018735, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69771171, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.8198795318603516 + }, + { + "auxiliary_loss_clip": 0.01481405, + "auxiliary_loss_mlp": 0.01288992, + "balance_loss_clip": 1.14836264, + "balance_loss_mlp": 1.04447031, + "epoch": 0.33614910566661654, + "flos": 19320152650560.0, + "grad_norm": 3.3039528126830398, + "language_loss": 0.73866475, + "learning_rate": 3.094754183798047e-06, + "loss": 0.76636875, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.814504384994507 + }, + { + "auxiliary_loss_clip": 0.01474022, + "auxiliary_loss_mlp": 0.01270311, + "balance_loss_clip": 1.14115405, + "balance_loss_mlp": 1.02369046, + "epoch": 0.3362092289192845, + "flos": 16474146819840.0, + "grad_norm": 2.1245899215891373, + "language_loss": 0.69778913, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72523248, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.757006883621216 + }, + { + "auxiliary_loss_clip": 0.01469186, + "auxiliary_loss_mlp": 0.01283212, + "balance_loss_clip": 1.13822103, + "balance_loss_mlp": 1.03697324, + "epoch": 0.33626935217195253, + "flos": 24245894044800.0, + "grad_norm": 2.385129722321487, + "language_loss": 0.76619804, + "learning_rate": 3.094102230664423e-06, + "loss": 0.79372203, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.794821262359619 + }, + { + "auxiliary_loss_clip": 0.01473459, + "auxiliary_loss_mlp": 0.01278955, + "balance_loss_clip": 1.14167595, + "balance_loss_mlp": 1.0281384, + "epoch": 0.3363294754246205, + "flos": 19720791812160.0, + "grad_norm": 2.060620723497528, + "language_loss": 0.72045141, + "learning_rate": 3.093776191858731e-06, + "loss": 0.74797553, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.7292404174804688 + }, + { + "auxiliary_loss_clip": 0.01470523, + "auxiliary_loss_mlp": 0.01286396, + "balance_loss_clip": 1.13761413, + "balance_loss_mlp": 1.04015732, + "epoch": 0.33638959867728846, + "flos": 22598202523680.0, + "grad_norm": 1.5824785289498884, + "language_loss": 0.79830396, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82587314, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 2.783569097518921 + }, + { + "auxiliary_loss_clip": 0.01470142, + "auxiliary_loss_mlp": 0.01282568, + "balance_loss_clip": 1.13754797, + "balance_loss_mlp": 1.03842735, + "epoch": 0.3364497219299564, + "flos": 20996783722080.0, + "grad_norm": 2.3750075615102397, + "language_loss": 0.81291032, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.84043741, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.792645215988159 + }, + { + "auxiliary_loss_clip": 0.01466654, + "auxiliary_loss_mlp": 0.01274329, + "balance_loss_clip": 1.1342237, + "balance_loss_mlp": 1.03018832, + "epoch": 0.3365098451826244, + "flos": 25230974052960.0, + "grad_norm": 2.342632556708469, + "language_loss": 0.7572124, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78462219, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 2.78816556930542 + }, + { + "auxiliary_loss_clip": 0.01468616, + "auxiliary_loss_mlp": 0.01274302, + "balance_loss_clip": 1.13626981, + "balance_loss_mlp": 1.02806365, + "epoch": 0.33656996843529235, + "flos": 24574013835840.0, + "grad_norm": 1.8790036842076097, + "language_loss": 0.78768873, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.8151179, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 2.856306314468384 + }, + { + "auxiliary_loss_clip": 0.01479018, + "auxiliary_loss_mlp": 0.01289835, + "balance_loss_clip": 1.1462189, + "balance_loss_mlp": 1.04168928, + "epoch": 0.3366300916879603, + "flos": 44094505213440.0, + "grad_norm": 1.6130200255292269, + "language_loss": 0.64538616, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.67307472, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.9372942447662354 + }, + { + "auxiliary_loss_clip": 0.01468999, + "auxiliary_loss_mlp": 0.01297575, + "balance_loss_clip": 1.13687432, + "balance_loss_mlp": 1.04637754, + "epoch": 0.3366902149406283, + "flos": 13881503648160.0, + "grad_norm": 12.551577368091651, + "language_loss": 0.82545513, + "learning_rate": 3.091819088459249e-06, + "loss": 0.85312092, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 2.731825828552246 + }, + { + "auxiliary_loss_clip": 0.01470788, + "auxiliary_loss_mlp": 0.01282987, + "balance_loss_clip": 1.13739967, + "balance_loss_mlp": 1.03693891, + "epoch": 0.33675033819329625, + "flos": 16254957797280.0, + "grad_norm": 2.311500792172738, + "language_loss": 0.83346295, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.86100072, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.71966552734375 + }, + { + "auxiliary_loss_clip": 0.01474928, + "auxiliary_loss_mlp": 0.01284401, + "balance_loss_clip": 1.14069939, + "balance_loss_mlp": 1.04121399, + "epoch": 0.3368104614459642, + "flos": 17057715318720.0, + "grad_norm": 1.8919098834309431, + "language_loss": 0.82873762, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85633099, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 2.7717466354370117 + }, + { + "auxiliary_loss_clip": 0.01469359, + "auxiliary_loss_mlp": 0.01287228, + "balance_loss_clip": 1.13507366, + "balance_loss_mlp": 1.04041755, + "epoch": 0.3368705846986322, + "flos": 17860586624640.0, + "grad_norm": 1.8849666758782406, + "language_loss": 0.69757092, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72513676, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 2.74322247505188 + }, + { + "auxiliary_loss_clip": 0.01470711, + "auxiliary_loss_mlp": 0.01281433, + "balance_loss_clip": 1.1375196, + "balance_loss_mlp": 1.03233373, + "epoch": 0.33693070795130015, + "flos": 22931708113440.0, + "grad_norm": 1.478751394446481, + "language_loss": 0.83370543, + "learning_rate": 3.090513524656898e-06, + "loss": 0.86122686, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.81323504447937 + }, + { + "auxiliary_loss_clip": 0.01474934, + "auxiliary_loss_mlp": 0.01278466, + "balance_loss_clip": 1.1428206, + "balance_loss_mlp": 1.0314641, + "epoch": 0.3369908312039681, + "flos": 22019602613760.0, + "grad_norm": 1.8470844378398852, + "language_loss": 0.73329127, + "learning_rate": 3.090187030294409e-06, + "loss": 0.76082528, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 2.80825138092041 + }, + { + "auxiliary_loss_clip": 0.01471412, + "auxiliary_loss_mlp": 0.01292907, + "balance_loss_clip": 1.13843191, + "balance_loss_mlp": 1.04495132, + "epoch": 0.33705095445663613, + "flos": 11803588636320.0, + "grad_norm": 2.749286910551207, + "language_loss": 0.84053624, + "learning_rate": 3.089860494591919e-06, + "loss": 0.86817944, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.973982572555542 + }, + { + "auxiliary_loss_clip": 0.01465313, + "auxiliary_loss_mlp": 0.01280034, + "balance_loss_clip": 1.1315949, + "balance_loss_mlp": 1.03551221, + "epoch": 0.3371110777093041, + "flos": 25048765350720.0, + "grad_norm": 1.7233579957019487, + "language_loss": 0.68175125, + "learning_rate": 3.089533917561809e-06, + "loss": 0.70920479, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 2.82804536819458 + }, + { + "auxiliary_loss_clip": 0.01469391, + "auxiliary_loss_mlp": 0.01286124, + "balance_loss_clip": 1.13591373, + "balance_loss_mlp": 1.03797793, + "epoch": 0.33717120096197206, + "flos": 26581495525920.0, + "grad_norm": 2.178394866623158, + "language_loss": 0.71319425, + "learning_rate": 3.089207299216464e-06, + "loss": 0.74074948, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 2.7727766036987305 + }, + { + "auxiliary_loss_clip": 0.01475715, + "auxiliary_loss_mlp": 0.01292686, + "balance_loss_clip": 1.14186239, + "balance_loss_mlp": 1.04682851, + "epoch": 0.33723132421464, + "flos": 15160340170080.0, + "grad_norm": 2.2643088512354885, + "language_loss": 0.79552734, + "learning_rate": 3.088880639568269e-06, + "loss": 0.82321131, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 2.852607488632202 + }, + { + "auxiliary_loss_clip": 0.0148227, + "auxiliary_loss_mlp": 0.01297775, + "balance_loss_clip": 1.15001249, + "balance_loss_mlp": 1.04962897, + "epoch": 0.337291447467308, + "flos": 23438092078080.0, + "grad_norm": 1.6927231218801657, + "language_loss": 0.82485533, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.85265577, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 2.773667097091675 + }, + { + "auxiliary_loss_clip": 0.01474956, + "auxiliary_loss_mlp": 0.01282893, + "balance_loss_clip": 1.14155817, + "balance_loss_mlp": 1.03760791, + "epoch": 0.33735157071997596, + "flos": 17240075733600.0, + "grad_norm": 1.9873893398809996, + "language_loss": 0.82196569, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84954423, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 4.392494440078735 + }, + { + "auxiliary_loss_clip": 0.01480239, + "auxiliary_loss_mlp": 0.01293357, + "balance_loss_clip": 1.14888263, + "balance_loss_mlp": 1.04540133, + "epoch": 0.3374116939726439, + "flos": 28259947149120.0, + "grad_norm": 1.7274282256437674, + "language_loss": 0.79445708, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.82219303, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.827087163925171 + }, + { + "auxiliary_loss_clip": 0.01469141, + "auxiliary_loss_mlp": 0.01280199, + "balance_loss_clip": 1.13722861, + "balance_loss_mlp": 1.03205347, + "epoch": 0.3374718172253119, + "flos": 35922384324000.0, + "grad_norm": 2.53696268094692, + "language_loss": 0.70104527, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72853863, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 2.8464345932006836 + }, + { + "auxiliary_loss_clip": 0.01481668, + "auxiliary_loss_mlp": 0.01276923, + "balance_loss_clip": 1.15011764, + "balance_loss_mlp": 1.02801442, + "epoch": 0.33753194047797985, + "flos": 18188630559360.0, + "grad_norm": 1.9495806245278164, + "language_loss": 0.79868573, + "learning_rate": 3.087246722218144e-06, + "loss": 0.82627165, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.7839772701263428 + }, + { + "auxiliary_loss_clip": 0.0148, + "auxiliary_loss_mlp": 0.01279724, + "balance_loss_clip": 1.1485064, + "balance_loss_mlp": 1.03043365, + "epoch": 0.3375920637306478, + "flos": 23151162633120.0, + "grad_norm": 1.9175118291204485, + "language_loss": 0.91054332, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93814057, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 2.757164716720581 + }, + { + "auxiliary_loss_clip": 0.01471927, + "auxiliary_loss_mlp": 0.01277535, + "balance_loss_clip": 1.14122498, + "balance_loss_mlp": 1.03167772, + "epoch": 0.3376521869833158, + "flos": 23114447809920.0, + "grad_norm": 1.6991770558522508, + "language_loss": 0.80923402, + "learning_rate": 3.086592866591809e-06, + "loss": 0.83672857, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 2.7779345512390137 + }, + { + "auxiliary_loss_clip": 0.01476598, + "auxiliary_loss_mlp": 0.01293658, + "balance_loss_clip": 1.1443063, + "balance_loss_mlp": 1.04322314, + "epoch": 0.33771231023598375, + "flos": 19276231476960.0, + "grad_norm": 1.767679935789972, + "language_loss": 0.8408981, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86860067, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 2.715818166732788 + }, + { + "auxiliary_loss_clip": 0.01477219, + "auxiliary_loss_mlp": 0.01283474, + "balance_loss_clip": 1.14441013, + "balance_loss_mlp": 1.03838015, + "epoch": 0.3377724334886517, + "flos": 18152143305120.0, + "grad_norm": 1.6928708566308985, + "language_loss": 0.80347431, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.83108127, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.6393556594848633 + }, + { + "auxiliary_loss_clip": 0.01476908, + "auxiliary_loss_mlp": 0.01283872, + "balance_loss_clip": 1.14406025, + "balance_loss_mlp": 1.03820539, + "epoch": 0.3378325567413197, + "flos": 25778396651040.0, + "grad_norm": 1.700229314935368, + "language_loss": 0.70890152, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73650932, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 2.6855738162994385 + }, + { + "auxiliary_loss_clip": 0.01483051, + "auxiliary_loss_mlp": 0.01289713, + "balance_loss_clip": 1.14942157, + "balance_loss_mlp": 1.04786146, + "epoch": 0.3378926799939877, + "flos": 21318835007520.0, + "grad_norm": 2.3716614813432044, + "language_loss": 0.70493811, + "learning_rate": 3.085284660993821e-06, + "loss": 0.73266578, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.6690738201141357 + }, + { + "auxiliary_loss_clip": 0.01475648, + "auxiliary_loss_mlp": 0.01278096, + "balance_loss_clip": 1.14286053, + "balance_loss_mlp": 1.02861452, + "epoch": 0.33795280324665566, + "flos": 24902361195840.0, + "grad_norm": 2.7899171645534464, + "language_loss": 0.68100977, + "learning_rate": 3.084957506678058e-06, + "loss": 0.70854723, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.762396812438965 + }, + { + "auxiliary_loss_clip": 0.01468664, + "auxiliary_loss_mlp": 0.01281271, + "balance_loss_clip": 1.13707685, + "balance_loss_mlp": 1.0369401, + "epoch": 0.33801292649932363, + "flos": 24756260466240.0, + "grad_norm": 2.7477770402365245, + "language_loss": 0.83258295, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.86008227, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 4.203803300857544 + }, + { + "auxiliary_loss_clip": 0.01477653, + "auxiliary_loss_mlp": 0.01281793, + "balance_loss_clip": 1.1455102, + "balance_loss_mlp": 1.03555453, + "epoch": 0.3380730497519916, + "flos": 26726382554400.0, + "grad_norm": 1.483663673739704, + "language_loss": 0.73735785, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.76495224, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 4.317176580429077 + }, + { + "auxiliary_loss_clip": 0.01648229, + "auxiliary_loss_mlp": 0.0124353, + "balance_loss_clip": 1.31834984, + "balance_loss_mlp": 1.02914429, + "epoch": 0.33813317300465956, + "flos": 70042821478560.0, + "grad_norm": 0.752573092600494, + "language_loss": 0.54979229, + "learning_rate": 3.083975796930215e-06, + "loss": 0.57870984, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.4221997261047363 + }, + { + "auxiliary_loss_clip": 0.01475513, + "auxiliary_loss_mlp": 0.01292766, + "balance_loss_clip": 1.14356482, + "balance_loss_mlp": 1.04614627, + "epoch": 0.3381932962573275, + "flos": 24099603674400.0, + "grad_norm": 2.5144734583909325, + "language_loss": 0.73504204, + "learning_rate": 3.083648478122111e-06, + "loss": 0.76272482, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.7828497886657715 + }, + { + "auxiliary_loss_clip": 0.01468182, + "auxiliary_loss_mlp": 0.0127461, + "balance_loss_clip": 1.13531566, + "balance_loss_mlp": 1.02779889, + "epoch": 0.3382534195099955, + "flos": 19280100149280.0, + "grad_norm": 2.0103099440466172, + "language_loss": 0.71045756, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.73788548, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.7319436073303223 + }, + { + "auxiliary_loss_clip": 0.01487221, + "auxiliary_loss_mlp": 0.01283547, + "balance_loss_clip": 1.15362108, + "balance_loss_mlp": 1.0403595, + "epoch": 0.33831354276266346, + "flos": 25228470794400.0, + "grad_norm": 2.4353805423758086, + "language_loss": 0.81351209, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.84121972, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.799802541732788 + }, + { + "auxiliary_loss_clip": 0.01482062, + "auxiliary_loss_mlp": 0.01278267, + "balance_loss_clip": 1.14871025, + "balance_loss_mlp": 1.02821386, + "epoch": 0.3383736660153314, + "flos": 23114220240960.0, + "grad_norm": 4.355209521437975, + "language_loss": 0.80483353, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.8324368, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 2.772775650024414 + }, + { + "auxiliary_loss_clip": 0.01479949, + "auxiliary_loss_mlp": 0.01286875, + "balance_loss_clip": 1.14766693, + "balance_loss_mlp": 1.03834796, + "epoch": 0.3384337892679994, + "flos": 23479396208640.0, + "grad_norm": 2.0804472228080577, + "language_loss": 0.77839488, + "learning_rate": 3.082338792093254e-06, + "loss": 0.80606318, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 2.8003456592559814 + }, + { + "auxiliary_loss_clip": 0.0146997, + "auxiliary_loss_mlp": 0.01296126, + "balance_loss_clip": 1.13716912, + "balance_loss_mlp": 1.04759812, + "epoch": 0.33849391252066735, + "flos": 19427755933440.0, + "grad_norm": 1.8053212709335515, + "language_loss": 0.85207152, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87973249, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.7691662311553955 + }, + { + "auxiliary_loss_clip": 0.01479368, + "auxiliary_loss_mlp": 0.01280916, + "balance_loss_clip": 1.14590693, + "balance_loss_mlp": 1.03315163, + "epoch": 0.3385540357733353, + "flos": 21066723977760.0, + "grad_norm": 1.9790460884270344, + "language_loss": 0.72219503, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.74979794, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.8177106380462646 + }, + { + "auxiliary_loss_clip": 0.01617556, + "auxiliary_loss_mlp": 0.0123925, + "balance_loss_clip": 1.2879976, + "balance_loss_mlp": 1.02181244, + "epoch": 0.3386141590260033, + "flos": 69213628029600.0, + "grad_norm": 0.8417006666867294, + "language_loss": 0.56098604, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58955413, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.4005990028381348 + }, + { + "auxiliary_loss_clip": 0.01473262, + "auxiliary_loss_mlp": 0.01276764, + "balance_loss_clip": 1.13954747, + "balance_loss_mlp": 1.02671087, + "epoch": 0.3386742822786713, + "flos": 25521810098400.0, + "grad_norm": 1.615700705347726, + "language_loss": 0.80284953, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.83034974, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.808847427368164 + }, + { + "auxiliary_loss_clip": 0.01468618, + "auxiliary_loss_mlp": 0.01278649, + "balance_loss_clip": 1.13589764, + "balance_loss_mlp": 1.03183794, + "epoch": 0.33873440553133927, + "flos": 23625383153760.0, + "grad_norm": 2.217974839775267, + "language_loss": 0.59135342, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61882603, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 2.812204360961914 + }, + { + "auxiliary_loss_clip": 0.01472323, + "auxiliary_loss_mlp": 0.01283909, + "balance_loss_clip": 1.13884175, + "balance_loss_mlp": 1.03767061, + "epoch": 0.33879452878400723, + "flos": 17090599397760.0, + "grad_norm": 1.865292204311642, + "language_loss": 0.93122518, + "learning_rate": 3.080373032026589e-06, + "loss": 0.95878756, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 2.740095615386963 + }, + { + "auxiliary_loss_clip": 0.0147267, + "auxiliary_loss_mlp": 0.01282637, + "balance_loss_clip": 1.13827813, + "balance_loss_mlp": 1.04097629, + "epoch": 0.3388546520366752, + "flos": 15743870740800.0, + "grad_norm": 1.7588578185715473, + "language_loss": 0.75170112, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.7792542, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.7871036529541016 + }, + { + "auxiliary_loss_clip": 0.01478656, + "auxiliary_loss_mlp": 0.01292401, + "balance_loss_clip": 1.14521646, + "balance_loss_mlp": 1.04749715, + "epoch": 0.33891477528934316, + "flos": 22420924482240.0, + "grad_norm": 1.4952317205649028, + "language_loss": 0.8352955, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.86300611, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 2.7649505138397217 + }, + { + "auxiliary_loss_clip": 0.01478708, + "auxiliary_loss_mlp": 0.01302625, + "balance_loss_clip": 1.14572084, + "balance_loss_mlp": 1.05524218, + "epoch": 0.3389748985420111, + "flos": 17276752628640.0, + "grad_norm": 6.041468021785179, + "language_loss": 0.69209617, + "learning_rate": 3.079389598759495e-06, + "loss": 0.71990955, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 2.7555229663848877 + }, + { + "auxiliary_loss_clip": 0.01474125, + "auxiliary_loss_mlp": 0.01294275, + "balance_loss_clip": 1.13964391, + "balance_loss_mlp": 1.04937172, + "epoch": 0.3390350217946791, + "flos": 27747418822560.0, + "grad_norm": 18.05579815478616, + "language_loss": 0.80868655, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83637059, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.8207321166992188 + }, + { + "auxiliary_loss_clip": 0.01474041, + "auxiliary_loss_mlp": 0.01299804, + "balance_loss_clip": 1.14081383, + "balance_loss_mlp": 1.05356574, + "epoch": 0.33909514504734706, + "flos": 20341909553760.0, + "grad_norm": 3.8083334971447322, + "language_loss": 0.680318, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70805645, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.746413469314575 + }, + { + "auxiliary_loss_clip": 0.01476006, + "auxiliary_loss_mlp": 0.01287385, + "balance_loss_clip": 1.14259458, + "balance_loss_mlp": 1.04038358, + "epoch": 0.339155268300015, + "flos": 14831803169280.0, + "grad_norm": 2.474954361988976, + "language_loss": 0.70085198, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72848582, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 2.796677350997925 + }, + { + "auxiliary_loss_clip": 0.01473146, + "auxiliary_loss_mlp": 0.01299196, + "balance_loss_clip": 1.13939583, + "balance_loss_mlp": 1.05162179, + "epoch": 0.339215391552683, + "flos": 26070370541280.0, + "grad_norm": 2.2535821216041283, + "language_loss": 0.8767072, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.90443063, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 2.756453275680542 + }, + { + "auxiliary_loss_clip": 0.01467538, + "auxiliary_loss_mlp": 0.01268276, + "balance_loss_clip": 1.13410532, + "balance_loss_mlp": 1.02585185, + "epoch": 0.33927551480535095, + "flos": 14576468245920.0, + "grad_norm": 2.0367286699240976, + "language_loss": 0.83475268, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86211085, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.7549521923065186 + }, + { + "auxiliary_loss_clip": 0.01469488, + "auxiliary_loss_mlp": 0.01285793, + "balance_loss_clip": 1.13567328, + "balance_loss_mlp": 1.04050827, + "epoch": 0.3393356380580189, + "flos": 23807857353120.0, + "grad_norm": 1.6094614526601267, + "language_loss": 0.77146113, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79901397, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 2.7934792041778564 + }, + { + "auxiliary_loss_clip": 0.01471315, + "auxiliary_loss_mlp": 0.01295845, + "balance_loss_clip": 1.1372478, + "balance_loss_mlp": 1.05017853, + "epoch": 0.3393957613106869, + "flos": 17349802993440.0, + "grad_norm": 3.147102847638376, + "language_loss": 0.63805783, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.6657294, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 2.8048713207244873 + }, + { + "auxiliary_loss_clip": 0.01465534, + "auxiliary_loss_mlp": 0.01284179, + "balance_loss_clip": 1.1322763, + "balance_loss_mlp": 1.04099238, + "epoch": 0.3394558845633549, + "flos": 28436201130240.0, + "grad_norm": 1.9998371350388975, + "language_loss": 0.76661593, + "learning_rate": 3.076765310014552e-06, + "loss": 0.79411304, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 2.816800594329834 + }, + { + "auxiliary_loss_clip": 0.01468041, + "auxiliary_loss_mlp": 0.01285029, + "balance_loss_clip": 1.13430893, + "balance_loss_mlp": 1.03573871, + "epoch": 0.33951600781602287, + "flos": 22088898090720.0, + "grad_norm": 2.2049422226855233, + "language_loss": 0.79030699, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81783772, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.770927667617798 + }, + { + "auxiliary_loss_clip": 0.01471248, + "auxiliary_loss_mlp": 0.01293298, + "balance_loss_clip": 1.13684034, + "balance_loss_mlp": 1.0476315, + "epoch": 0.33957613106869083, + "flos": 23880528436320.0, + "grad_norm": 2.3916404604996515, + "language_loss": 0.77845478, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.80610025, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.792726993560791 + }, + { + "auxiliary_loss_clip": 0.01583818, + "auxiliary_loss_mlp": 0.01228523, + "balance_loss_clip": 1.25511146, + "balance_loss_mlp": 1.01184845, + "epoch": 0.3396362543213588, + "flos": 71249935121280.0, + "grad_norm": 0.783514339773441, + "language_loss": 0.56245178, + "learning_rate": 3.075780527680754e-06, + "loss": 0.59057522, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 4.9700987339019775 + }, + { + "auxiliary_loss_clip": 0.01469572, + "auxiliary_loss_mlp": 0.01296204, + "balance_loss_clip": 1.13632774, + "balance_loss_mlp": 1.05339861, + "epoch": 0.33969637757402676, + "flos": 25924004314560.0, + "grad_norm": 1.4973958213817495, + "language_loss": 0.85596377, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.88362145, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 3.018540143966675 + }, + { + "auxiliary_loss_clip": 0.0146735, + "auxiliary_loss_mlp": 0.01284228, + "balance_loss_clip": 1.13414729, + "balance_loss_mlp": 1.04123163, + "epoch": 0.33975650082669473, + "flos": 35264627615520.0, + "grad_norm": 1.8946055610730417, + "language_loss": 0.70762384, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73513961, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 3.0032217502593994 + }, + { + "auxiliary_loss_clip": 0.01475366, + "auxiliary_loss_mlp": 0.01293032, + "balance_loss_clip": 1.14307308, + "balance_loss_mlp": 1.04698396, + "epoch": 0.3398166240793627, + "flos": 16648466464800.0, + "grad_norm": 2.173514900853333, + "language_loss": 0.81067228, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83835626, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 2.8760335445404053 + }, + { + "auxiliary_loss_clip": 0.01475492, + "auxiliary_loss_mlp": 0.01287207, + "balance_loss_clip": 1.14245927, + "balance_loss_mlp": 1.03944206, + "epoch": 0.33987674733203066, + "flos": 24064557690240.0, + "grad_norm": 2.2551059433603404, + "language_loss": 0.7733078, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.80093479, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 2.9055933952331543 + }, + { + "auxiliary_loss_clip": 0.01467177, + "auxiliary_loss_mlp": 0.01283178, + "balance_loss_clip": 1.13478529, + "balance_loss_mlp": 1.03732133, + "epoch": 0.3399368705846986, + "flos": 13251776214240.0, + "grad_norm": 2.9813961267504205, + "language_loss": 0.85612893, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.88363254, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 2.772965431213379 + }, + { + "auxiliary_loss_clip": 0.01481856, + "auxiliary_loss_mlp": 0.01289479, + "balance_loss_clip": 1.1494081, + "balance_loss_mlp": 1.04381299, + "epoch": 0.3399969938373666, + "flos": 27015094622880.0, + "grad_norm": 4.527868675483943, + "language_loss": 0.65377778, + "learning_rate": 3.073809861919351e-06, + "loss": 0.68149114, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 2.7854366302490234 + }, + { + "auxiliary_loss_clip": 0.01476949, + "auxiliary_loss_mlp": 0.01276295, + "balance_loss_clip": 1.14560807, + "balance_loss_mlp": 1.02986526, + "epoch": 0.34005711709003456, + "flos": 28552717530720.0, + "grad_norm": 1.497038713767409, + "language_loss": 0.7679655, + "learning_rate": 3.073481275036697e-06, + "loss": 0.79549795, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.854497194290161 + }, + { + "auxiliary_loss_clip": 0.01475604, + "auxiliary_loss_mlp": 0.01279713, + "balance_loss_clip": 1.14277744, + "balance_loss_mlp": 1.02870631, + "epoch": 0.3401172403427025, + "flos": 21619191021120.0, + "grad_norm": 2.021450463863121, + "language_loss": 0.83618021, + "learning_rate": 3.073152647447525e-06, + "loss": 0.86373335, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 2.7770776748657227 + }, + { + "auxiliary_loss_clip": 0.0147762, + "auxiliary_loss_mlp": 0.0128907, + "balance_loss_clip": 1.14535403, + "balance_loss_mlp": 1.04531062, + "epoch": 0.3401773635953705, + "flos": 25888199767200.0, + "grad_norm": 2.04796009481265, + "language_loss": 0.85641271, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.88407964, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.82828688621521 + }, + { + "auxiliary_loss_clip": 0.01575917, + "auxiliary_loss_mlp": 0.01236816, + "balance_loss_clip": 1.24869335, + "balance_loss_mlp": 1.01785278, + "epoch": 0.3402374868480385, + "flos": 65514609136800.0, + "grad_norm": 0.8130901789003192, + "language_loss": 0.59949267, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62762004, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 4.817779541015625 + }, + { + "auxiliary_loss_clip": 0.0147893, + "auxiliary_loss_mlp": 0.01285856, + "balance_loss_clip": 1.14705706, + "balance_loss_mlp": 1.04343224, + "epoch": 0.34029761010070647, + "flos": 24062850923040.0, + "grad_norm": 2.0706037299949096, + "language_loss": 0.68055826, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70820618, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 5.459131240844727 + }, + { + "auxiliary_loss_clip": 0.01487232, + "auxiliary_loss_mlp": 0.01296557, + "balance_loss_clip": 1.15525019, + "balance_loss_mlp": 1.05432379, + "epoch": 0.34035773335337444, + "flos": 27602190440640.0, + "grad_norm": 1.958273442626037, + "language_loss": 0.67191577, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69975364, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 4.385195732116699 + }, + { + "auxiliary_loss_clip": 0.01479848, + "auxiliary_loss_mlp": 0.01281422, + "balance_loss_clip": 1.14846504, + "balance_loss_mlp": 1.03937995, + "epoch": 0.3404178566060424, + "flos": 20814840516960.0, + "grad_norm": 1.6917012305263046, + "language_loss": 0.78913563, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81674838, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.8397417068481445 + }, + { + "auxiliary_loss_clip": 0.01479956, + "auxiliary_loss_mlp": 0.01279118, + "balance_loss_clip": 1.14804435, + "balance_loss_mlp": 1.03326035, + "epoch": 0.34047797985871037, + "flos": 26835996029760.0, + "grad_norm": 1.9951266678691741, + "language_loss": 0.73579693, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.76338768, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 2.875729560852051 + }, + { + "auxiliary_loss_clip": 0.01481975, + "auxiliary_loss_mlp": 0.01281032, + "balance_loss_clip": 1.14990485, + "balance_loss_mlp": 1.0384171, + "epoch": 0.34053810311137833, + "flos": 19684683839520.0, + "grad_norm": 1.7378111323099956, + "language_loss": 0.86536765, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.89299774, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 2.789811372756958 + }, + { + "auxiliary_loss_clip": 0.01474121, + "auxiliary_loss_mlp": 0.012932, + "balance_loss_clip": 1.14311922, + "balance_loss_mlp": 1.04543555, + "epoch": 0.3405982263640463, + "flos": 21727894220640.0, + "grad_norm": 1.8554542815644743, + "language_loss": 0.69187492, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71954811, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.8487844467163086 + }, + { + "auxiliary_loss_clip": 0.01478926, + "auxiliary_loss_mlp": 0.01284799, + "balance_loss_clip": 1.14831376, + "balance_loss_mlp": 1.03856015, + "epoch": 0.34065834961671426, + "flos": 18043629746400.0, + "grad_norm": 3.1790870906658024, + "language_loss": 0.73350072, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.76113796, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.8866348266601562 + }, + { + "auxiliary_loss_clip": 0.01478685, + "auxiliary_loss_mlp": 0.01285383, + "balance_loss_clip": 1.14777732, + "balance_loss_mlp": 1.03876269, + "epoch": 0.3407184728693822, + "flos": 21399471004320.0, + "grad_norm": 1.9574037961607245, + "language_loss": 0.73725796, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.7648986, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 2.809516429901123 + }, + { + "auxiliary_loss_clip": 0.01574199, + "auxiliary_loss_mlp": 0.01245544, + "balance_loss_clip": 1.2480427, + "balance_loss_mlp": 1.03115845, + "epoch": 0.3407785961220502, + "flos": 68695220473920.0, + "grad_norm": 0.841570537422335, + "language_loss": 0.63305557, + "learning_rate": 3.069535060901597e-06, + "loss": 0.66125309, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.563627004623413 + }, + { + "auxiliary_loss_clip": 0.01481011, + "auxiliary_loss_mlp": 0.01289624, + "balance_loss_clip": 1.1499207, + "balance_loss_mlp": 1.04567373, + "epoch": 0.34083871937471816, + "flos": 14066139752640.0, + "grad_norm": 2.972019933010326, + "language_loss": 0.71994531, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74765164, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 2.826014280319214 + }, + { + "auxiliary_loss_clip": 0.0148029, + "auxiliary_loss_mlp": 0.01282209, + "balance_loss_clip": 1.14966846, + "balance_loss_mlp": 1.03616142, + "epoch": 0.3408988426273861, + "flos": 17086958294400.0, + "grad_norm": 2.3765339721918894, + "language_loss": 0.80155617, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82918113, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 2.817551851272583 + }, + { + "auxiliary_loss_clip": 0.01481234, + "auxiliary_loss_mlp": 0.01287177, + "balance_loss_clip": 1.14995635, + "balance_loss_mlp": 1.03998494, + "epoch": 0.3409589658800541, + "flos": 24026856734880.0, + "grad_norm": 1.728423821798475, + "language_loss": 0.7720021, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79968619, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.9341728687286377 + }, + { + "auxiliary_loss_clip": 0.01484892, + "auxiliary_loss_mlp": 0.01274936, + "balance_loss_clip": 1.15366733, + "balance_loss_mlp": 1.02526402, + "epoch": 0.34101908913272205, + "flos": 21144136080960.0, + "grad_norm": 1.7929725328276631, + "language_loss": 0.74109751, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.76869571, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 2.829925060272217 + }, + { + "auxiliary_loss_clip": 0.01479567, + "auxiliary_loss_mlp": 0.01279186, + "balance_loss_clip": 1.14847326, + "balance_loss_mlp": 1.03332925, + "epoch": 0.3410792123853901, + "flos": 15703628598720.0, + "grad_norm": 1.7829209560802108, + "language_loss": 0.73949003, + "learning_rate": 3.06788908010777e-06, + "loss": 0.76707757, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 2.8176863193511963 + }, + { + "auxiliary_loss_clip": 0.01482057, + "auxiliary_loss_mlp": 0.01284677, + "balance_loss_clip": 1.15131891, + "balance_loss_mlp": 1.03710318, + "epoch": 0.34113933563805804, + "flos": 23038059767040.0, + "grad_norm": 3.3828833403305656, + "language_loss": 0.7976476, + "learning_rate": 3.067559762415682e-06, + "loss": 0.82531494, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 2.8255085945129395 + }, + { + "auxiliary_loss_clip": 0.01584595, + "auxiliary_loss_mlp": 0.01232475, + "balance_loss_clip": 1.25983381, + "balance_loss_mlp": 1.0142746, + "epoch": 0.341199458890726, + "flos": 69620146056000.0, + "grad_norm": 0.7892884913039852, + "language_loss": 0.56097472, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58914542, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.6328306198120117 + }, + { + "auxiliary_loss_clip": 0.0148468, + "auxiliary_loss_mlp": 0.01275817, + "balance_loss_clip": 1.15261531, + "balance_loss_mlp": 1.02957809, + "epoch": 0.34125958214339397, + "flos": 22348556824320.0, + "grad_norm": 1.8312804871790467, + "language_loss": 0.79310882, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.82071376, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 2.863379716873169 + }, + { + "auxiliary_loss_clip": 0.01480931, + "auxiliary_loss_mlp": 0.01276411, + "balance_loss_clip": 1.14916122, + "balance_loss_mlp": 1.02979052, + "epoch": 0.34131970539606193, + "flos": 21874108734720.0, + "grad_norm": 1.8562010653507977, + "language_loss": 0.86379123, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.89136463, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 2.83668851852417 + }, + { + "auxiliary_loss_clip": 0.01484057, + "auxiliary_loss_mlp": 0.01281057, + "balance_loss_clip": 1.15213656, + "balance_loss_mlp": 1.03233838, + "epoch": 0.3413798286487299, + "flos": 24938014030560.0, + "grad_norm": 2.087306031678184, + "language_loss": 0.79754549, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.82519662, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 2.9514055252075195 + }, + { + "auxiliary_loss_clip": 0.01481351, + "auxiliary_loss_mlp": 0.01276382, + "balance_loss_clip": 1.15013194, + "balance_loss_mlp": 1.02918935, + "epoch": 0.34143995190139786, + "flos": 25376733429120.0, + "grad_norm": 2.528061350248105, + "language_loss": 0.75568199, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.78325939, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 2.8415908813476562 + }, + { + "auxiliary_loss_clip": 0.01587337, + "auxiliary_loss_mlp": 0.01231003, + "balance_loss_clip": 1.26353645, + "balance_loss_mlp": 1.014328, + "epoch": 0.34150007515406583, + "flos": 67790017899360.0, + "grad_norm": 0.7170751790521233, + "language_loss": 0.59430754, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.622491, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 3.4537744522094727 + }, + { + "auxiliary_loss_clip": 0.01479441, + "auxiliary_loss_mlp": 0.01286128, + "balance_loss_clip": 1.14827347, + "balance_loss_mlp": 1.04465759, + "epoch": 0.3415601984067338, + "flos": 20304474095520.0, + "grad_norm": 2.3507383283468144, + "language_loss": 0.72122467, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74888039, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.9054903984069824 + }, + { + "auxiliary_loss_clip": 0.01472534, + "auxiliary_loss_mlp": 0.01289491, + "balance_loss_clip": 1.14219975, + "balance_loss_mlp": 1.0449692, + "epoch": 0.34162032165940176, + "flos": 26033579861760.0, + "grad_norm": 1.9279571857499973, + "language_loss": 0.7164855, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7441057, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 2.765176773071289 + }, + { + "auxiliary_loss_clip": 0.01477256, + "auxiliary_loss_mlp": 0.01282128, + "balance_loss_clip": 1.1454767, + "balance_loss_mlp": 1.0362705, + "epoch": 0.3416804449120697, + "flos": 28805852620800.0, + "grad_norm": 2.612268160987914, + "language_loss": 0.84267139, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.87026525, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 2.8530430793762207 + }, + { + "auxiliary_loss_clip": 0.01486013, + "auxiliary_loss_mlp": 0.01295894, + "balance_loss_clip": 1.15539622, + "balance_loss_mlp": 1.05137229, + "epoch": 0.3417405681647377, + "flos": 22603815891360.0, + "grad_norm": 1.7778128232287718, + "language_loss": 0.71262437, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.74044347, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 2.7944180965423584 + }, + { + "auxiliary_loss_clip": 0.0147889, + "auxiliary_loss_mlp": 0.01272956, + "balance_loss_clip": 1.14878762, + "balance_loss_mlp": 1.02919698, + "epoch": 0.34180069141740566, + "flos": 24718483654560.0, + "grad_norm": 1.38497334420569, + "language_loss": 0.75202626, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77954471, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.7941346168518066 + }, + { + "auxiliary_loss_clip": 0.01478986, + "auxiliary_loss_mlp": 0.01291951, + "balance_loss_clip": 1.14893675, + "balance_loss_mlp": 1.05086219, + "epoch": 0.3418608146700737, + "flos": 30521398348800.0, + "grad_norm": 1.8681268607652937, + "language_loss": 0.70650887, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.73421818, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.8204450607299805 + }, + { + "auxiliary_loss_clip": 0.01477098, + "auxiliary_loss_mlp": 0.01286981, + "balance_loss_clip": 1.14680588, + "balance_loss_mlp": 1.04131484, + "epoch": 0.34192093792274164, + "flos": 15124156341120.0, + "grad_norm": 2.497207157264697, + "language_loss": 0.78026932, + "learning_rate": 3.06327495310661e-06, + "loss": 0.80791014, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 4.331718444824219 + }, + { + "auxiliary_loss_clip": 0.0148451, + "auxiliary_loss_mlp": 0.01288932, + "balance_loss_clip": 1.1544708, + "balance_loss_mlp": 1.04727066, + "epoch": 0.3419810611754096, + "flos": 13189459518720.0, + "grad_norm": 2.9457235371833055, + "language_loss": 0.8695457, + "learning_rate": 3.062945069803981e-06, + "loss": 0.89728016, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.7819623947143555 + }, + { + "auxiliary_loss_clip": 0.01477704, + "auxiliary_loss_mlp": 0.01286295, + "balance_loss_clip": 1.14614582, + "balance_loss_mlp": 1.03643262, + "epoch": 0.34204118442807757, + "flos": 19538279684640.0, + "grad_norm": 2.1809465548106672, + "language_loss": 0.79660308, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82424307, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 2.6839752197265625 + }, + { + "auxiliary_loss_clip": 0.01472056, + "auxiliary_loss_mlp": 0.01303061, + "balance_loss_clip": 1.14097524, + "balance_loss_mlp": 1.05796695, + "epoch": 0.34210130768074554, + "flos": 15196637783520.0, + "grad_norm": 1.8768426400502716, + "language_loss": 0.73399842, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.76174963, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.7436468601226807 + }, + { + "auxiliary_loss_clip": 0.01471081, + "auxiliary_loss_mlp": 0.01272597, + "balance_loss_clip": 1.14031255, + "balance_loss_mlp": 1.02750289, + "epoch": 0.3421614309334135, + "flos": 24938545024800.0, + "grad_norm": 2.5870787772876436, + "language_loss": 0.75989413, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78733087, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.7537357807159424 + }, + { + "auxiliary_loss_clip": 0.01466338, + "auxiliary_loss_mlp": 0.0127594, + "balance_loss_clip": 1.13602376, + "balance_loss_mlp": 1.03122675, + "epoch": 0.34222155418608147, + "flos": 21910823557920.0, + "grad_norm": 3.3542479431582946, + "language_loss": 0.68070751, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70813024, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.8707940578460693 + }, + { + "auxiliary_loss_clip": 0.01479659, + "auxiliary_loss_mlp": 0.01291738, + "balance_loss_clip": 1.14875627, + "balance_loss_mlp": 1.0456903, + "epoch": 0.34228167743874943, + "flos": 18116035332480.0, + "grad_norm": 2.1700455276868653, + "language_loss": 0.72623253, + "learning_rate": 3.06129504893632e-06, + "loss": 0.75394642, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 2.8524975776672363 + }, + { + "auxiliary_loss_clip": 0.01471399, + "auxiliary_loss_mlp": 0.01280306, + "balance_loss_clip": 1.14125633, + "balance_loss_mlp": 1.03463972, + "epoch": 0.3423418006914174, + "flos": 21290767804800.0, + "grad_norm": 3.474413110766876, + "language_loss": 0.75336874, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.78088582, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.8288381099700928 + }, + { + "auxiliary_loss_clip": 0.01485199, + "auxiliary_loss_mlp": 0.01288862, + "balance_loss_clip": 1.1552422, + "balance_loss_mlp": 1.04605651, + "epoch": 0.34240192394408536, + "flos": 19825247057760.0, + "grad_norm": 2.2084611045441354, + "language_loss": 0.79588652, + "learning_rate": 3.060634758790747e-06, + "loss": 0.82362711, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.816906213760376 + }, + { + "auxiliary_loss_clip": 0.01480874, + "auxiliary_loss_mlp": 0.01278342, + "balance_loss_clip": 1.1515975, + "balance_loss_mlp": 1.03229415, + "epoch": 0.3424620471967533, + "flos": 24537867935040.0, + "grad_norm": 1.8409453936228717, + "language_loss": 0.73339337, + "learning_rate": 3.060304553382635e-06, + "loss": 0.76098549, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 2.7630581855773926 + }, + { + "auxiliary_loss_clip": 0.01488518, + "auxiliary_loss_mlp": 0.0128862, + "balance_loss_clip": 1.1591537, + "balance_loss_mlp": 1.04466975, + "epoch": 0.3425221704494213, + "flos": 25851295303200.0, + "grad_norm": 2.449470583858621, + "language_loss": 0.7132529, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.74102432, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 4.329521179199219 + }, + { + "auxiliary_loss_clip": 0.01494652, + "auxiliary_loss_mlp": 0.01284549, + "balance_loss_clip": 1.16411996, + "balance_loss_mlp": 1.04021716, + "epoch": 0.34258229370208926, + "flos": 21542499552960.0, + "grad_norm": 2.5128531742327826, + "language_loss": 0.82252407, + "learning_rate": 3.05964402195837e-06, + "loss": 0.85031617, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 4.897622585296631 + }, + { + "auxiliary_loss_clip": 0.01480821, + "auxiliary_loss_mlp": 0.0127966, + "balance_loss_clip": 1.15103805, + "balance_loss_mlp": 1.02941561, + "epoch": 0.3426424169547573, + "flos": 23654664057600.0, + "grad_norm": 5.234864969481042, + "language_loss": 0.68468201, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71228683, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 4.3260273933410645 + }, + { + "auxiliary_loss_clip": 0.01480546, + "auxiliary_loss_mlp": 0.01270969, + "balance_loss_clip": 1.15011048, + "balance_loss_mlp": 1.02568436, + "epoch": 0.34270254020742524, + "flos": 24647215913280.0, + "grad_norm": 3.062056956972389, + "language_loss": 0.72938478, + "learning_rate": 3.058983329806877e-06, + "loss": 0.75689995, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.7884817123413086 + }, + { + "auxiliary_loss_clip": 0.01493495, + "auxiliary_loss_mlp": 0.01285513, + "balance_loss_clip": 1.16560984, + "balance_loss_mlp": 1.03812957, + "epoch": 0.3427626634600932, + "flos": 20998907699040.0, + "grad_norm": 2.4735130918057275, + "language_loss": 0.82160372, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.84939384, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 2.889112949371338 + }, + { + "auxiliary_loss_clip": 0.01479075, + "auxiliary_loss_mlp": 0.01278819, + "balance_loss_clip": 1.14959002, + "balance_loss_mlp": 1.0323894, + "epoch": 0.3428227867127612, + "flos": 21435920330400.0, + "grad_norm": 1.7539281108312346, + "language_loss": 0.71435213, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.74193108, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.7520205974578857 + }, + { + "auxiliary_loss_clip": 0.01682324, + "auxiliary_loss_mlp": 0.01220589, + "balance_loss_clip": 1.36352456, + "balance_loss_mlp": 1.00315094, + "epoch": 0.34288290996542914, + "flos": 55737959336640.0, + "grad_norm": 0.7859673504789878, + "language_loss": 0.57266057, + "learning_rate": 3.057991990435309e-06, + "loss": 0.60168964, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 3.176565647125244 + }, + { + "auxiliary_loss_clip": 0.01499541, + "auxiliary_loss_mlp": 0.01286647, + "balance_loss_clip": 1.17110491, + "balance_loss_mlp": 1.0369755, + "epoch": 0.3429430332180971, + "flos": 20158866432000.0, + "grad_norm": 4.372867277891704, + "language_loss": 0.74894738, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77680928, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.790480852127075 + }, + { + "auxiliary_loss_clip": 0.01497547, + "auxiliary_loss_mlp": 0.01293047, + "balance_loss_clip": 1.1679132, + "balance_loss_mlp": 1.04833448, + "epoch": 0.34300315647076507, + "flos": 17967696841440.0, + "grad_norm": 2.4880124058720003, + "language_loss": 0.72957265, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75747859, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.811514139175415 + }, + { + "auxiliary_loss_clip": 0.0148346, + "auxiliary_loss_mlp": 0.01296332, + "balance_loss_clip": 1.15544665, + "balance_loss_mlp": 1.04646921, + "epoch": 0.34306327972343303, + "flos": 22088784306240.0, + "grad_norm": 2.1340353851348297, + "language_loss": 0.79589581, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82369369, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.8212292194366455 + }, + { + "auxiliary_loss_clip": 0.01506429, + "auxiliary_loss_mlp": 0.01296471, + "balance_loss_clip": 1.17765284, + "balance_loss_mlp": 1.04889727, + "epoch": 0.343123402976101, + "flos": 18444686117760.0, + "grad_norm": 2.1787196870328254, + "language_loss": 0.82803571, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85606468, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 2.814704179763794 + }, + { + "auxiliary_loss_clip": 0.01504621, + "auxiliary_loss_mlp": 0.01286443, + "balance_loss_clip": 1.17567086, + "balance_loss_mlp": 1.04230201, + "epoch": 0.34318352622876896, + "flos": 17165508242400.0, + "grad_norm": 3.0927827308378, + "language_loss": 0.75091839, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77882898, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.766028881072998 + }, + { + "auxiliary_loss_clip": 0.01489868, + "auxiliary_loss_mlp": 0.0129526, + "balance_loss_clip": 1.16140258, + "balance_loss_mlp": 1.05169177, + "epoch": 0.34324364948143693, + "flos": 26690615935200.0, + "grad_norm": 1.6696148687418466, + "language_loss": 0.80928683, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83713818, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 2.851217031478882 + }, + { + "auxiliary_loss_clip": 0.01503457, + "auxiliary_loss_mlp": 0.01282037, + "balance_loss_clip": 1.17460167, + "balance_loss_mlp": 1.03713298, + "epoch": 0.3433037727341049, + "flos": 21253711628160.0, + "grad_norm": 2.7269490755288768, + "language_loss": 0.79446852, + "learning_rate": 3.055677461649329e-06, + "loss": 0.82232344, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 2.809539318084717 + }, + { + "auxiliary_loss_clip": 0.01497758, + "auxiliary_loss_mlp": 0.01288791, + "balance_loss_clip": 1.16842628, + "balance_loss_mlp": 1.04102683, + "epoch": 0.34336389598677286, + "flos": 20631418113600.0, + "grad_norm": 2.2437634682545347, + "language_loss": 0.69907331, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72693878, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 2.9170327186584473 + }, + { + "auxiliary_loss_clip": 0.0149892, + "auxiliary_loss_mlp": 0.01281856, + "balance_loss_clip": 1.16965771, + "balance_loss_mlp": 1.03714335, + "epoch": 0.3434240192394409, + "flos": 14540360273280.0, + "grad_norm": 1.988383942556602, + "language_loss": 0.67735219, + "learning_rate": 3.055015807239812e-06, + "loss": 0.7051599, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.846966028213501 + }, + { + "auxiliary_loss_clip": 0.01687075, + "auxiliary_loss_mlp": 0.0126149, + "balance_loss_clip": 1.36820161, + "balance_loss_mlp": 1.04710388, + "epoch": 0.34348414249210885, + "flos": 58056986211840.0, + "grad_norm": 0.8454363002555857, + "language_loss": 0.57966781, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60915351, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.321902275085449 + }, + { + "auxiliary_loss_clip": 0.01500274, + "auxiliary_loss_mlp": 0.01291769, + "balance_loss_clip": 1.17230296, + "balance_loss_mlp": 1.04514861, + "epoch": 0.3435442657447768, + "flos": 20706554527200.0, + "grad_norm": 1.7949847701020238, + "language_loss": 0.80903196, + "learning_rate": 3.054353992805076e-06, + "loss": 0.83695239, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 2.9028279781341553 + }, + { + "auxiliary_loss_clip": 0.01501981, + "auxiliary_loss_mlp": 0.01290297, + "balance_loss_clip": 1.17285705, + "balance_loss_mlp": 1.04291415, + "epoch": 0.3436043889974448, + "flos": 22932504604800.0, + "grad_norm": 1.9726082126363849, + "language_loss": 0.71655905, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7444818, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 2.8440682888031006 + }, + { + "auxiliary_loss_clip": 0.01687309, + "auxiliary_loss_mlp": 0.0124852, + "balance_loss_clip": 1.36817086, + "balance_loss_mlp": 1.03337097, + "epoch": 0.34366451225011274, + "flos": 58410062732160.0, + "grad_norm": 1.2370150722753213, + "language_loss": 0.65750241, + "learning_rate": 3.053692018445505e-06, + "loss": 0.68686068, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 3.3511509895324707 + }, + { + "auxiliary_loss_clip": 0.01505712, + "auxiliary_loss_mlp": 0.0128048, + "balance_loss_clip": 1.17729783, + "balance_loss_mlp": 1.03443146, + "epoch": 0.3437246355027807, + "flos": 15598111364640.0, + "grad_norm": 1.8688056120549852, + "language_loss": 0.74434793, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.77220988, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 2.8095860481262207 + }, + { + "auxiliary_loss_clip": 0.01499624, + "auxiliary_loss_mlp": 0.01280115, + "balance_loss_clip": 1.17166555, + "balance_loss_mlp": 1.03559268, + "epoch": 0.34378475875544867, + "flos": 27674709811200.0, + "grad_norm": 2.070514045335462, + "language_loss": 0.75442445, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.78222179, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 2.9675347805023193 + }, + { + "auxiliary_loss_clip": 0.01498688, + "auxiliary_loss_mlp": 0.01287041, + "balance_loss_clip": 1.17033005, + "balance_loss_mlp": 1.04232824, + "epoch": 0.34384488200811664, + "flos": 31434224483520.0, + "grad_norm": 2.377879737006369, + "language_loss": 0.63907743, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66693473, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 2.983792781829834 + }, + { + "auxiliary_loss_clip": 0.01502684, + "auxiliary_loss_mlp": 0.01294098, + "balance_loss_clip": 1.17348862, + "balance_loss_mlp": 1.0474782, + "epoch": 0.3439050052607846, + "flos": 24902247411360.0, + "grad_norm": 2.1208581520995606, + "language_loss": 0.73389304, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.76186085, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 2.8484432697296143 + }, + { + "auxiliary_loss_clip": 0.01508944, + "auxiliary_loss_mlp": 0.01305217, + "balance_loss_clip": 1.17927325, + "balance_loss_mlp": 1.06279337, + "epoch": 0.34396512851345257, + "flos": 18152029520640.0, + "grad_norm": 1.7792628424242407, + "language_loss": 0.74472934, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.77287096, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 2.8156578540802 + }, + { + "auxiliary_loss_clip": 0.01502839, + "auxiliary_loss_mlp": 0.01273808, + "balance_loss_clip": 1.17437768, + "balance_loss_mlp": 1.02508974, + "epoch": 0.34402525176612053, + "flos": 16036299768960.0, + "grad_norm": 2.144554709362242, + "language_loss": 0.80296105, + "learning_rate": 3.051705136821992e-06, + "loss": 0.83072758, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 2.749423027038574 + }, + { + "auxiliary_loss_clip": 0.01500301, + "auxiliary_loss_mlp": 0.01278818, + "balance_loss_clip": 1.17183518, + "balance_loss_mlp": 1.03677559, + "epoch": 0.3440853750187885, + "flos": 21180775047840.0, + "grad_norm": 1.7732766636916522, + "language_loss": 0.81415939, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84195054, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.79878306388855 + }, + { + "auxiliary_loss_clip": 0.01499159, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 1.17087197, + "balance_loss_mlp": 1.04103291, + "epoch": 0.34414549827145646, + "flos": 12679548235200.0, + "grad_norm": 2.1423109928724724, + "language_loss": 0.81377375, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.8416304, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.7112479209899902 + }, + { + "auxiliary_loss_clip": 0.01499393, + "auxiliary_loss_mlp": 0.01284793, + "balance_loss_clip": 1.17052364, + "balance_loss_mlp": 1.03855479, + "epoch": 0.3442056215241244, + "flos": 31287175549920.0, + "grad_norm": 2.0478728213648223, + "language_loss": 0.6889168, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71675867, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 4.442259311676025 + }, + { + "auxiliary_loss_clip": 0.01497538, + "auxiliary_loss_mlp": 0.01281739, + "balance_loss_clip": 1.16885126, + "balance_loss_mlp": 1.0292064, + "epoch": 0.34426574477679245, + "flos": 23369517236160.0, + "grad_norm": 1.568547067496701, + "language_loss": 0.69289768, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.72069049, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 3.0351386070251465 + }, + { + "auxiliary_loss_clip": 0.0149448, + "auxiliary_loss_mlp": 0.01288507, + "balance_loss_clip": 1.16671634, + "balance_loss_mlp": 1.04608274, + "epoch": 0.3443258680294604, + "flos": 24537754150560.0, + "grad_norm": 2.0944276865623968, + "language_loss": 0.73619241, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.76402229, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.7915334701538086 + }, + { + "auxiliary_loss_clip": 0.01505338, + "auxiliary_loss_mlp": 0.01286248, + "balance_loss_clip": 1.1771549, + "balance_loss_mlp": 1.0392468, + "epoch": 0.3443859912821284, + "flos": 20232220222080.0, + "grad_norm": 3.9405176966738473, + "language_loss": 0.88091135, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90882719, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.738186836242676 + }, + { + "auxiliary_loss_clip": 0.01502922, + "auxiliary_loss_mlp": 0.01286432, + "balance_loss_clip": 1.17507386, + "balance_loss_mlp": 1.04210091, + "epoch": 0.34444611453479634, + "flos": 24318830625120.0, + "grad_norm": 2.069892192862021, + "language_loss": 0.70432913, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.73222268, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 2.852323055267334 + }, + { + "auxiliary_loss_clip": 0.01506914, + "auxiliary_loss_mlp": 0.01271225, + "balance_loss_clip": 1.17863309, + "balance_loss_mlp": 1.02727473, + "epoch": 0.3445062377874643, + "flos": 16985347660800.0, + "grad_norm": 2.0000717471953986, + "language_loss": 0.73571724, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76349866, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.764259099960327 + }, + { + "auxiliary_loss_clip": 0.0149953, + "auxiliary_loss_mlp": 0.01274676, + "balance_loss_clip": 1.17208791, + "balance_loss_mlp": 1.02862811, + "epoch": 0.3445663610401323, + "flos": 20304777520800.0, + "grad_norm": 2.185804762721448, + "language_loss": 0.79747277, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82521474, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 2.8113880157470703 + }, + { + "auxiliary_loss_clip": 0.01505454, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 1.17787981, + "balance_loss_mlp": 1.02742076, + "epoch": 0.34462648429280024, + "flos": 15889933542240.0, + "grad_norm": 2.1028032241016965, + "language_loss": 0.78603232, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81382346, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 2.8467636108398438 + }, + { + "auxiliary_loss_clip": 0.0168113, + "auxiliary_loss_mlp": 0.01221436, + "balance_loss_clip": 1.36664367, + "balance_loss_mlp": 1.00094604, + "epoch": 0.3446866075454682, + "flos": 59317616852640.0, + "grad_norm": 0.7427947633718295, + "language_loss": 0.53495741, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.56398308, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.3715689182281494 + }, + { + "auxiliary_loss_clip": 0.01514176, + "auxiliary_loss_mlp": 0.01278382, + "balance_loss_clip": 1.18528783, + "balance_loss_mlp": 1.03614879, + "epoch": 0.34474673079813617, + "flos": 22345863924960.0, + "grad_norm": 1.8020183987325538, + "language_loss": 0.8352133, + "learning_rate": 3.047727069167207e-06, + "loss": 0.86313891, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.9195823669433594 + }, + { + "auxiliary_loss_clip": 0.01497463, + "auxiliary_loss_mlp": 0.01282121, + "balance_loss_clip": 1.17099142, + "balance_loss_mlp": 1.03511882, + "epoch": 0.34480685405080413, + "flos": 27672472049760.0, + "grad_norm": 2.0969783965672555, + "language_loss": 0.92732239, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.95511824, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 2.8945250511169434 + }, + { + "auxiliary_loss_clip": 0.01512768, + "auxiliary_loss_mlp": 0.01286592, + "balance_loss_clip": 1.18606639, + "balance_loss_mlp": 1.04264212, + "epoch": 0.3448669773034721, + "flos": 22458246156000.0, + "grad_norm": 1.7710042878475896, + "language_loss": 0.77037346, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.79836714, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 6.258164644241333 + }, + { + "auxiliary_loss_clip": 0.01504494, + "auxiliary_loss_mlp": 0.01292829, + "balance_loss_clip": 1.17696714, + "balance_loss_mlp": 1.05021405, + "epoch": 0.34492710055614006, + "flos": 24938507096640.0, + "grad_norm": 1.8888989785709636, + "language_loss": 0.78930956, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.81728274, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.8170011043548584 + }, + { + "auxiliary_loss_clip": 0.01512152, + "auxiliary_loss_mlp": 0.01290767, + "balance_loss_clip": 1.18432546, + "balance_loss_mlp": 1.04300237, + "epoch": 0.34498722380880803, + "flos": 20122910172000.0, + "grad_norm": 2.600787364338154, + "language_loss": 0.71780813, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.74583733, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 4.306297302246094 + }, + { + "auxiliary_loss_clip": 0.01504801, + "auxiliary_loss_mlp": 0.01292587, + "balance_loss_clip": 1.17679763, + "balance_loss_mlp": 1.04825568, + "epoch": 0.34504734706147605, + "flos": 28440600796800.0, + "grad_norm": 3.5086680942856474, + "language_loss": 0.81746143, + "learning_rate": 3.046067851209389e-06, + "loss": 0.84543526, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.9019670486450195 + }, + { + "auxiliary_loss_clip": 0.01513396, + "auxiliary_loss_mlp": 0.01294139, + "balance_loss_clip": 1.18510234, + "balance_loss_mlp": 1.0477097, + "epoch": 0.345107470314144, + "flos": 22676676615360.0, + "grad_norm": 2.019940471275447, + "language_loss": 0.82782984, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85590518, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 2.7606794834136963 + }, + { + "auxiliary_loss_clip": 0.01512281, + "auxiliary_loss_mlp": 0.01292711, + "balance_loss_clip": 1.18547893, + "balance_loss_mlp": 1.04818869, + "epoch": 0.345167593566812, + "flos": 20632783527360.0, + "grad_norm": 2.2954946606342395, + "language_loss": 0.76940429, + "learning_rate": 3.045403886269181e-06, + "loss": 0.79745424, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.7622833251953125 + }, + { + "auxiliary_loss_clip": 0.01502672, + "auxiliary_loss_mlp": 0.01277775, + "balance_loss_clip": 1.17547023, + "balance_loss_mlp": 1.03058243, + "epoch": 0.34522771681947995, + "flos": 26216812624320.0, + "grad_norm": 1.6074287331930035, + "language_loss": 0.77234304, + "learning_rate": 3.045071844330053e-06, + "loss": 0.80014753, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.821929454803467 + }, + { + "auxiliary_loss_clip": 0.01498115, + "auxiliary_loss_mlp": 0.01301103, + "balance_loss_clip": 1.17173374, + "balance_loss_mlp": 1.06173062, + "epoch": 0.3452878400721479, + "flos": 19064362589280.0, + "grad_norm": 2.2259183140409946, + "language_loss": 0.76398945, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.7919817, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.775200843811035 + }, + { + "auxiliary_loss_clip": 0.0150863, + "auxiliary_loss_mlp": 0.01292984, + "balance_loss_clip": 1.18109977, + "balance_loss_mlp": 1.05056, + "epoch": 0.3453479633248159, + "flos": 27932320424160.0, + "grad_norm": 1.6655276662980043, + "language_loss": 0.70168692, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72970307, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.844086170196533 + }, + { + "auxiliary_loss_clip": 0.01508663, + "auxiliary_loss_mlp": 0.01288702, + "balance_loss_clip": 1.18190241, + "balance_loss_mlp": 1.04761279, + "epoch": 0.34540808657748384, + "flos": 19607954443200.0, + "grad_norm": 1.7468368793520708, + "language_loss": 0.79676318, + "learning_rate": 3.044075480787665e-06, + "loss": 0.82473677, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.753908157348633 + }, + { + "auxiliary_loss_clip": 0.01507905, + "auxiliary_loss_mlp": 0.01294266, + "balance_loss_clip": 1.18047237, + "balance_loss_mlp": 1.05088806, + "epoch": 0.3454682098301518, + "flos": 20413708289280.0, + "grad_norm": 1.8460222681311786, + "language_loss": 0.89236796, + "learning_rate": 3.043743280407182e-06, + "loss": 0.92038965, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 2.7670414447784424 + }, + { + "auxiliary_loss_clip": 0.01510271, + "auxiliary_loss_mlp": 0.01277081, + "balance_loss_clip": 1.18292642, + "balance_loss_mlp": 1.03122365, + "epoch": 0.34552833308281977, + "flos": 21327292987200.0, + "grad_norm": 2.4039999475818363, + "language_loss": 0.64815772, + "learning_rate": 3.043411040447849e-06, + "loss": 0.67603123, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.9055778980255127 + }, + { + "auxiliary_loss_clip": 0.01505715, + "auxiliary_loss_mlp": 0.01275857, + "balance_loss_clip": 1.17913318, + "balance_loss_mlp": 1.03476834, + "epoch": 0.34558845633548774, + "flos": 36246407873760.0, + "grad_norm": 1.7551451653428147, + "language_loss": 0.73180187, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75961769, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 2.8855762481689453 + }, + { + "auxiliary_loss_clip": 0.01504254, + "auxiliary_loss_mlp": 0.01283324, + "balance_loss_clip": 1.17797101, + "balance_loss_mlp": 1.04433322, + "epoch": 0.3456485795881557, + "flos": 22452481075680.0, + "grad_norm": 1.8387330429594209, + "language_loss": 0.75481653, + "learning_rate": 3.042746441843029e-06, + "loss": 0.78269231, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 2.8220698833465576 + }, + { + "auxiliary_loss_clip": 0.01671284, + "auxiliary_loss_mlp": 0.01250511, + "balance_loss_clip": 1.35921395, + "balance_loss_mlp": 1.03917694, + "epoch": 0.34570870284082367, + "flos": 62010656956800.0, + "grad_norm": 0.8869431481422702, + "language_loss": 0.62605542, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.65527338, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 3.206876516342163 + }, + { + "auxiliary_loss_clip": 0.01506785, + "auxiliary_loss_mlp": 0.0128204, + "balance_loss_clip": 1.1812228, + "balance_loss_mlp": 1.04190493, + "epoch": 0.34576882609349163, + "flos": 22784317826400.0, + "grad_norm": 13.550289819369715, + "language_loss": 0.80502343, + "learning_rate": 3.042081685074012e-06, + "loss": 0.83291167, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.8999969959259033 + }, + { + "auxiliary_loss_clip": 0.0150696, + "auxiliary_loss_mlp": 0.01291526, + "balance_loss_clip": 1.17964435, + "balance_loss_mlp": 1.05291641, + "epoch": 0.34582894934615965, + "flos": 12350442312000.0, + "grad_norm": 2.070248236456644, + "language_loss": 0.84316391, + "learning_rate": 3.041749247409439e-06, + "loss": 0.87114871, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.8042681217193604 + }, + { + "auxiliary_loss_clip": 0.01676467, + "auxiliary_loss_mlp": 0.01237122, + "balance_loss_clip": 1.36514199, + "balance_loss_mlp": 1.02578735, + "epoch": 0.3458890725988276, + "flos": 70173978513120.0, + "grad_norm": 0.7363188893122885, + "language_loss": 0.63009155, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.65922749, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 3.2379252910614014 + }, + { + "auxiliary_loss_clip": 0.01516502, + "auxiliary_loss_mlp": 0.01284473, + "balance_loss_clip": 1.18805146, + "balance_loss_mlp": 1.04204905, + "epoch": 0.3459491958514956, + "flos": 17094505998240.0, + "grad_norm": 2.50003248996437, + "language_loss": 0.70603216, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73404193, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 2.7730724811553955 + }, + { + "auxiliary_loss_clip": 0.01508689, + "auxiliary_loss_mlp": 0.01279055, + "balance_loss_clip": 1.18237758, + "balance_loss_mlp": 1.03300714, + "epoch": 0.34600931910416355, + "flos": 16652638562400.0, + "grad_norm": 1.7321619757940478, + "language_loss": 0.73008251, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75795996, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 2.8281946182250977 + }, + { + "auxiliary_loss_clip": 0.01510334, + "auxiliary_loss_mlp": 0.01277134, + "balance_loss_clip": 1.1846441, + "balance_loss_mlp": 1.03146768, + "epoch": 0.3460694423568315, + "flos": 38549314916640.0, + "grad_norm": 1.5108988797134086, + "language_loss": 0.72819513, + "learning_rate": 3.040419101844869e-06, + "loss": 0.75606984, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 2.9476375579833984 + }, + { + "auxiliary_loss_clip": 0.01683712, + "auxiliary_loss_mlp": 0.01237709, + "balance_loss_clip": 1.3730191, + "balance_loss_mlp": 1.02484894, + "epoch": 0.3461295656094995, + "flos": 72088800615360.0, + "grad_norm": 0.7109263354650579, + "language_loss": 0.62425196, + "learning_rate": 3.040086466790207e-06, + "loss": 0.65346611, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 3.33195424079895 + }, + { + "auxiliary_loss_clip": 0.01681639, + "auxiliary_loss_mlp": 0.01235771, + "balance_loss_clip": 1.37178004, + "balance_loss_mlp": 1.02367401, + "epoch": 0.34618968886216744, + "flos": 65466060727680.0, + "grad_norm": 0.8251972538708621, + "language_loss": 0.59186798, + "learning_rate": 3.039753792295362e-06, + "loss": 0.62104213, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 3.263295888900757 + }, + { + "auxiliary_loss_clip": 0.01525089, + "auxiliary_loss_mlp": 0.01283697, + "balance_loss_clip": 1.19980597, + "balance_loss_mlp": 1.04012871, + "epoch": 0.3462498121148354, + "flos": 23474541404160.0, + "grad_norm": 1.843690879298152, + "language_loss": 0.72077721, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.74886513, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 2.8870840072631836 + }, + { + "auxiliary_loss_clip": 0.01507143, + "auxiliary_loss_mlp": 0.01284705, + "balance_loss_clip": 1.18241167, + "balance_loss_mlp": 1.04399753, + "epoch": 0.3463099353675034, + "flos": 24173109177120.0, + "grad_norm": 1.817381617413299, + "language_loss": 0.83625335, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.86417186, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 2.882352590560913 + }, + { + "auxiliary_loss_clip": 0.01676463, + "auxiliary_loss_mlp": 0.01249718, + "balance_loss_clip": 1.3694011, + "balance_loss_mlp": 1.0406723, + "epoch": 0.34637005862017134, + "flos": 63706024903680.0, + "grad_norm": 0.8406082336650457, + "language_loss": 0.5649426, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.59420437, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 3.397055149078369 + }, + { + "auxiliary_loss_clip": 0.01514333, + "auxiliary_loss_mlp": 0.01275831, + "balance_loss_clip": 1.18848085, + "balance_loss_mlp": 1.0339787, + "epoch": 0.3464301818728393, + "flos": 13146865830720.0, + "grad_norm": 2.6841275817792476, + "language_loss": 0.95715535, + "learning_rate": 3.038422700166474e-06, + "loss": 0.985057, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 3.00024151802063 + }, + { + "auxiliary_loss_clip": 0.01509454, + "auxiliary_loss_mlp": 0.01287523, + "balance_loss_clip": 1.18386126, + "balance_loss_mlp": 1.04567146, + "epoch": 0.34649030512550727, + "flos": 29317774096800.0, + "grad_norm": 1.8277664992628844, + "language_loss": 0.69852275, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.72649252, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 4.557919502258301 + }, + { + "auxiliary_loss_clip": 0.01520388, + "auxiliary_loss_mlp": 0.01289435, + "balance_loss_clip": 1.19433725, + "balance_loss_mlp": 1.04453135, + "epoch": 0.34655042837817523, + "flos": 23733062292960.0, + "grad_norm": 1.8650699205331849, + "language_loss": 0.83765757, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.8657558, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.830314874649048 + }, + { + "auxiliary_loss_clip": 0.01521393, + "auxiliary_loss_mlp": 0.01291751, + "balance_loss_clip": 1.19555473, + "balance_loss_mlp": 1.04818273, + "epoch": 0.34661055163084326, + "flos": 22056393293280.0, + "grad_norm": 3.10654368653933, + "language_loss": 0.67951548, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.70764685, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.7729697227478027 + }, + { + "auxiliary_loss_clip": 0.01525429, + "auxiliary_loss_mlp": 0.01288216, + "balance_loss_clip": 1.19913149, + "balance_loss_mlp": 1.04560089, + "epoch": 0.3466706748835112, + "flos": 21801399723360.0, + "grad_norm": 1.987781060340692, + "language_loss": 0.76991749, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79805392, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.875878095626831 + }, + { + "auxiliary_loss_clip": 0.01506994, + "auxiliary_loss_mlp": 0.01290371, + "balance_loss_clip": 1.18092453, + "balance_loss_mlp": 1.05023551, + "epoch": 0.3467307981361792, + "flos": 19463598408960.0, + "grad_norm": 1.6624387630841284, + "language_loss": 0.734842, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.76281565, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.850152015686035 + }, + { + "auxiliary_loss_clip": 0.01504034, + "auxiliary_loss_mlp": 0.01287389, + "balance_loss_clip": 1.17853379, + "balance_loss_mlp": 1.04286647, + "epoch": 0.34679092138884715, + "flos": 24829803897120.0, + "grad_norm": 2.187964265441294, + "language_loss": 0.77904844, + "learning_rate": 3.036424880912893e-06, + "loss": 0.80696261, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 3.1928365230560303 + }, + { + "auxiliary_loss_clip": 0.01610079, + "auxiliary_loss_mlp": 0.0125827, + "balance_loss_clip": 1.30175936, + "balance_loss_mlp": 1.04922485, + "epoch": 0.3468510446415151, + "flos": 63242007058080.0, + "grad_norm": 0.7705321471790287, + "language_loss": 0.57380235, + "learning_rate": 3.036091773408956e-06, + "loss": 0.60248584, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.3301498889923096 + }, + { + "auxiliary_loss_clip": 0.0151081, + "auxiliary_loss_mlp": 0.01292814, + "balance_loss_clip": 1.18362641, + "balance_loss_mlp": 1.04600286, + "epoch": 0.3469111678941831, + "flos": 12121809177600.0, + "grad_norm": 3.304650797510915, + "language_loss": 0.86361295, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.89164925, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.800597906112671 + }, + { + "auxiliary_loss_clip": 0.01586785, + "auxiliary_loss_mlp": 0.01272118, + "balance_loss_clip": 1.27772772, + "balance_loss_mlp": 1.06383514, + "epoch": 0.34697129114685105, + "flos": 65940091607520.0, + "grad_norm": 0.7763487833405288, + "language_loss": 0.59730005, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.62588906, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 3.1185455322265625 + }, + { + "auxiliary_loss_clip": 0.01514324, + "auxiliary_loss_mlp": 0.01284771, + "balance_loss_clip": 1.1874305, + "balance_loss_mlp": 1.04024863, + "epoch": 0.347031414399519, + "flos": 34456787720640.0, + "grad_norm": 2.037299288865917, + "language_loss": 0.71855223, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.74654323, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.8803870677948 + }, + { + "auxiliary_loss_clip": 0.01508702, + "auxiliary_loss_mlp": 0.01279596, + "balance_loss_clip": 1.18384576, + "balance_loss_mlp": 1.03354764, + "epoch": 0.347091537652187, + "flos": 26946633565440.0, + "grad_norm": 11.00718133202892, + "language_loss": 0.76301801, + "learning_rate": 3.034758950632507e-06, + "loss": 0.79090095, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.819829225540161 + }, + { + "auxiliary_loss_clip": 0.01507119, + "auxiliary_loss_mlp": 0.01280989, + "balance_loss_clip": 1.18148506, + "balance_loss_mlp": 1.03303385, + "epoch": 0.34715166090485494, + "flos": 21144363649920.0, + "grad_norm": 2.2370198121640765, + "language_loss": 0.70368207, + "learning_rate": 3.034425646811396e-06, + "loss": 0.73156315, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 6.748900651931763 + }, + { + "auxiliary_loss_clip": 0.01512068, + "auxiliary_loss_mlp": 0.01278923, + "balance_loss_clip": 1.18651569, + "balance_loss_mlp": 1.03440058, + "epoch": 0.3472117841575229, + "flos": 23480306484480.0, + "grad_norm": 1.7461870058310658, + "language_loss": 0.76335371, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.79126358, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 2.889427423477173 + }, + { + "auxiliary_loss_clip": 0.01514675, + "auxiliary_loss_mlp": 0.01275752, + "balance_loss_clip": 1.18898797, + "balance_loss_mlp": 1.02531695, + "epoch": 0.34727190741019087, + "flos": 17494690021920.0, + "grad_norm": 2.5288440219459254, + "language_loss": 0.77507359, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.80297786, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 4.365558385848999 + }, + { + "auxiliary_loss_clip": 0.01579613, + "auxiliary_loss_mlp": 0.01234192, + "balance_loss_clip": 1.26813614, + "balance_loss_mlp": 1.02209473, + "epoch": 0.34733203066285884, + "flos": 65272814566560.0, + "grad_norm": 0.8804831818042895, + "language_loss": 0.63277042, + "learning_rate": 3.033425500045478e-06, + "loss": 0.66090846, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.340200185775757 + }, + { + "auxiliary_loss_clip": 0.01504243, + "auxiliary_loss_mlp": 0.01283092, + "balance_loss_clip": 1.1781919, + "balance_loss_mlp": 1.032848, + "epoch": 0.3473921539155268, + "flos": 28661306945760.0, + "grad_norm": 2.453256927240527, + "language_loss": 0.64750946, + "learning_rate": 3.033092039398119e-06, + "loss": 0.67538285, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.820413589477539 + }, + { + "auxiliary_loss_clip": 0.01515369, + "auxiliary_loss_mlp": 0.01284454, + "balance_loss_clip": 1.18821323, + "balance_loss_mlp": 1.03764343, + "epoch": 0.3474522771681948, + "flos": 40839705666720.0, + "grad_norm": 2.1809635988018994, + "language_loss": 0.71778446, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.74578273, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 2.9417052268981934 + }, + { + "auxiliary_loss_clip": 0.01515701, + "auxiliary_loss_mlp": 0.01282575, + "balance_loss_clip": 1.18956876, + "balance_loss_mlp": 1.03461993, + "epoch": 0.3475124004208628, + "flos": 24610918299840.0, + "grad_norm": 4.129414608424925, + "language_loss": 0.62308913, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.65107191, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.792219400405884 + }, + { + "auxiliary_loss_clip": 0.01514171, + "auxiliary_loss_mlp": 0.0128378, + "balance_loss_clip": 1.18828607, + "balance_loss_mlp": 1.03868592, + "epoch": 0.34757252367353075, + "flos": 22713543151200.0, + "grad_norm": 1.7734373150181864, + "language_loss": 0.72030747, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74828696, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.7841837406158447 + }, + { + "auxiliary_loss_clip": 0.0151194, + "auxiliary_loss_mlp": 0.01282057, + "balance_loss_clip": 1.18544996, + "balance_loss_mlp": 1.03677213, + "epoch": 0.3476326469261987, + "flos": 19830215646720.0, + "grad_norm": 2.1734671398205405, + "language_loss": 0.77177286, + "learning_rate": 3.031757805185612e-06, + "loss": 0.7997129, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.7669410705566406 + }, + { + "auxiliary_loss_clip": 0.01519075, + "auxiliary_loss_mlp": 0.01285808, + "balance_loss_clip": 1.19032836, + "balance_loss_mlp": 1.04109538, + "epoch": 0.3476927701788667, + "flos": 19940170475520.0, + "grad_norm": 2.290048947683242, + "language_loss": 0.62778032, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.65582913, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.8730993270874023 + }, + { + "auxiliary_loss_clip": 0.01505235, + "auxiliary_loss_mlp": 0.01284366, + "balance_loss_clip": 1.17740488, + "balance_loss_mlp": 1.0457561, + "epoch": 0.34775289343153465, + "flos": 20736897419520.0, + "grad_norm": 1.665835351619322, + "language_loss": 0.8872515, + "learning_rate": 3.031090453282605e-06, + "loss": 0.91514754, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.7777841091156006 + }, + { + "auxiliary_loss_clip": 0.01512962, + "auxiliary_loss_mlp": 0.0127302, + "balance_loss_clip": 1.18618655, + "balance_loss_mlp": 1.02887964, + "epoch": 0.3478130166842026, + "flos": 19356905401920.0, + "grad_norm": 2.1645011454134773, + "language_loss": 0.81647944, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.84433925, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 2.800091028213501 + }, + { + "auxiliary_loss_clip": 0.01522908, + "auxiliary_loss_mlp": 0.0128231, + "balance_loss_clip": 1.19619596, + "balance_loss_mlp": 1.03950429, + "epoch": 0.3478731399368706, + "flos": 22053207327840.0, + "grad_norm": 1.799871789068097, + "language_loss": 0.80467296, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.83272511, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 2.751084804534912 + }, + { + "auxiliary_loss_clip": 0.01505341, + "auxiliary_loss_mlp": 0.01277165, + "balance_loss_clip": 1.17893612, + "balance_loss_mlp": 1.03435969, + "epoch": 0.34793326318953854, + "flos": 18043440105600.0, + "grad_norm": 1.7813806484055885, + "language_loss": 0.75189769, + "learning_rate": 3.030089132216836e-06, + "loss": 0.77972275, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.7772462368011475 + }, + { + "auxiliary_loss_clip": 0.0150426, + "auxiliary_loss_mlp": 0.01281956, + "balance_loss_clip": 1.17801929, + "balance_loss_mlp": 1.04048622, + "epoch": 0.3479933864422065, + "flos": 29317243102560.0, + "grad_norm": 1.7798048941007336, + "language_loss": 0.81384146, + "learning_rate": 3.029755280389203e-06, + "loss": 0.84170365, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.869027853012085 + }, + { + "auxiliary_loss_clip": 0.01523736, + "auxiliary_loss_mlp": 0.01296227, + "balance_loss_clip": 1.19686639, + "balance_loss_mlp": 1.04865289, + "epoch": 0.3480535096948745, + "flos": 20122796387520.0, + "grad_norm": 1.844980639230532, + "language_loss": 0.85494268, + "learning_rate": 3.029421389513147e-06, + "loss": 0.88314235, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 2.8723950386047363 + }, + { + "auxiliary_loss_clip": 0.0151481, + "auxiliary_loss_mlp": 0.01294892, + "balance_loss_clip": 1.18845797, + "balance_loss_mlp": 1.04884458, + "epoch": 0.34811363294754244, + "flos": 18550810202400.0, + "grad_norm": 3.4940540622320446, + "language_loss": 0.8516593, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87975633, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.966134548187256 + }, + { + "auxiliary_loss_clip": 0.01512524, + "auxiliary_loss_mlp": 0.01278078, + "balance_loss_clip": 1.18631995, + "balance_loss_mlp": 1.02955091, + "epoch": 0.3481737562002104, + "flos": 26872748781120.0, + "grad_norm": 5.39231026917717, + "language_loss": 0.81538522, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.84329122, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.760885715484619 + }, + { + "auxiliary_loss_clip": 0.01515962, + "auxiliary_loss_mlp": 0.01276134, + "balance_loss_clip": 1.18937469, + "balance_loss_mlp": 1.02970433, + "epoch": 0.3482338794528784, + "flos": 28910952645120.0, + "grad_norm": 2.515044994812503, + "language_loss": 0.77907705, + "learning_rate": 3.028419482721056e-06, + "loss": 0.80699807, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 2.791049003601074 + }, + { + "auxiliary_loss_clip": 0.01503184, + "auxiliary_loss_mlp": 0.01270432, + "balance_loss_clip": 1.17714512, + "balance_loss_mlp": 1.02667284, + "epoch": 0.3482940027055464, + "flos": 22202987088960.0, + "grad_norm": 1.7185107623293425, + "language_loss": 0.8179152, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.84565139, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 2.7404448986053467 + }, + { + "auxiliary_loss_clip": 0.01515569, + "auxiliary_loss_mlp": 0.01292899, + "balance_loss_clip": 1.18913388, + "balance_loss_mlp": 1.04990244, + "epoch": 0.34835412595821436, + "flos": 20304777520800.0, + "grad_norm": 2.286768131940103, + "language_loss": 0.76318908, + "learning_rate": 3.027751349849706e-06, + "loss": 0.79127371, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 2.814466714859009 + }, + { + "auxiliary_loss_clip": 0.01504448, + "auxiliary_loss_mlp": 0.01283187, + "balance_loss_clip": 1.17748332, + "balance_loss_mlp": 1.04267013, + "epoch": 0.3484142492108823, + "flos": 20451826454400.0, + "grad_norm": 1.8855823223824613, + "language_loss": 0.57833314, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.60620952, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 2.752012252807617 + }, + { + "auxiliary_loss_clip": 0.01496979, + "auxiliary_loss_mlp": 0.01274109, + "balance_loss_clip": 1.17169094, + "balance_loss_mlp": 1.03244829, + "epoch": 0.3484743724635503, + "flos": 24355242023040.0, + "grad_norm": 2.159628828429485, + "language_loss": 0.82655466, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.85426557, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 2.8046483993530273 + }, + { + "auxiliary_loss_clip": 0.01503168, + "auxiliary_loss_mlp": 0.01278377, + "balance_loss_clip": 1.17628717, + "balance_loss_mlp": 1.03805089, + "epoch": 0.34853449571621825, + "flos": 24355317879360.0, + "grad_norm": 1.5966906010275734, + "language_loss": 0.83935267, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.86716807, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 2.793006181716919 + }, + { + "auxiliary_loss_clip": 0.01506453, + "auxiliary_loss_mlp": 0.01279005, + "balance_loss_clip": 1.17981029, + "balance_loss_mlp": 1.03581858, + "epoch": 0.3485946189688862, + "flos": 27269936480160.0, + "grad_norm": 1.6617102419185203, + "language_loss": 0.73478985, + "learning_rate": 3.026414616539167e-06, + "loss": 0.76264453, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.774806499481201 + }, + { + "auxiliary_loss_clip": 0.01493576, + "auxiliary_loss_mlp": 0.01273184, + "balance_loss_clip": 1.16673124, + "balance_loss_mlp": 1.02732706, + "epoch": 0.3486547422215542, + "flos": 20158752647520.0, + "grad_norm": 1.9817224346403997, + "language_loss": 0.76565546, + "learning_rate": 3.026080335875485e-06, + "loss": 0.79332304, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 2.7969706058502197 + }, + { + "auxiliary_loss_clip": 0.0149542, + "auxiliary_loss_mlp": 0.01286113, + "balance_loss_clip": 1.16988814, + "balance_loss_mlp": 1.04330707, + "epoch": 0.34871486547422215, + "flos": 20232447791040.0, + "grad_norm": 1.755556378620573, + "language_loss": 0.76004994, + "learning_rate": 3.025746016302734e-06, + "loss": 0.78786528, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 2.766710042953491 + }, + { + "auxiliary_loss_clip": 0.01505051, + "auxiliary_loss_mlp": 0.01307702, + "balance_loss_clip": 1.17727637, + "balance_loss_mlp": 1.06394243, + "epoch": 0.3487749887268901, + "flos": 44056538760960.0, + "grad_norm": 1.776903635067612, + "language_loss": 0.67266601, + "learning_rate": 3.025411657833591e-06, + "loss": 0.70079362, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 4.577158451080322 + }, + { + "auxiliary_loss_clip": 0.01517051, + "auxiliary_loss_mlp": 0.01289245, + "balance_loss_clip": 1.19158542, + "balance_loss_mlp": 1.04910946, + "epoch": 0.3488351119795581, + "flos": 23297415075360.0, + "grad_norm": 2.294233149636997, + "language_loss": 0.76746881, + "learning_rate": 3.025077260480735e-06, + "loss": 0.79553181, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.808084011077881 + }, + { + "auxiliary_loss_clip": 0.01503677, + "auxiliary_loss_mlp": 0.01292048, + "balance_loss_clip": 1.17706156, + "balance_loss_mlp": 1.05401075, + "epoch": 0.34889523523222604, + "flos": 19936719012960.0, + "grad_norm": 1.7962812401448316, + "language_loss": 0.792117, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.82007432, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.752047538757324 + }, + { + "auxiliary_loss_clip": 0.01493722, + "auxiliary_loss_mlp": 0.01287806, + "balance_loss_clip": 1.16768837, + "balance_loss_mlp": 1.04233062, + "epoch": 0.348955358484894, + "flos": 30448841050080.0, + "grad_norm": 2.018831378721225, + "language_loss": 0.67843431, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.70624959, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.9117422103881836 + }, + { + "auxiliary_loss_clip": 0.01508068, + "auxiliary_loss_mlp": 0.01297141, + "balance_loss_clip": 1.18300271, + "balance_loss_mlp": 1.05872238, + "epoch": 0.349015481737562, + "flos": 18001680837120.0, + "grad_norm": 1.9936610441203033, + "language_loss": 0.76116824, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78922033, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 2.7863309383392334 + }, + { + "auxiliary_loss_clip": 0.01504743, + "auxiliary_loss_mlp": 0.01283282, + "balance_loss_clip": 1.17792046, + "balance_loss_mlp": 1.03723383, + "epoch": 0.34907560499023, + "flos": 27200944428480.0, + "grad_norm": 2.5960552883950583, + "language_loss": 0.67375255, + "learning_rate": 3.023739282485814e-06, + "loss": 0.70163286, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.7942593097686768 + }, + { + "auxiliary_loss_clip": 0.0151468, + "auxiliary_loss_mlp": 0.01275551, + "balance_loss_clip": 1.18756425, + "balance_loss_mlp": 1.03007543, + "epoch": 0.34913572824289796, + "flos": 30229500314880.0, + "grad_norm": 1.6932005170899374, + "language_loss": 0.71850067, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74640298, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.8491320610046387 + }, + { + "auxiliary_loss_clip": 0.01501891, + "auxiliary_loss_mlp": 0.01271329, + "balance_loss_clip": 1.17590797, + "balance_loss_mlp": 1.02757001, + "epoch": 0.3491958514955659, + "flos": 29974544673120.0, + "grad_norm": 1.8376703058851913, + "language_loss": 0.74024308, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76797521, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 2.818056106567383 + }, + { + "auxiliary_loss_clip": 0.01507026, + "auxiliary_loss_mlp": 0.01279992, + "balance_loss_clip": 1.18140745, + "balance_loss_mlp": 1.03909421, + "epoch": 0.3492559747482339, + "flos": 22785455671200.0, + "grad_norm": 1.8843897152273898, + "language_loss": 0.84155071, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86942089, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.8342268466949463 + }, + { + "auxiliary_loss_clip": 0.01509186, + "auxiliary_loss_mlp": 0.01276109, + "balance_loss_clip": 1.18386579, + "balance_loss_mlp": 1.03501964, + "epoch": 0.34931609800090185, + "flos": 26070446397600.0, + "grad_norm": 1.9741693330357846, + "language_loss": 0.80512464, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.83297759, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.810553550720215 + }, + { + "auxiliary_loss_clip": 0.01506976, + "auxiliary_loss_mlp": 0.01283891, + "balance_loss_clip": 1.179968, + "balance_loss_mlp": 1.04318357, + "epoch": 0.3493762212535698, + "flos": 29244609947520.0, + "grad_norm": 1.7264519843585604, + "language_loss": 0.7579326, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.78584129, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 2.849614381790161 + }, + { + "auxiliary_loss_clip": 0.01499538, + "auxiliary_loss_mlp": 0.01276942, + "balance_loss_clip": 1.17347002, + "balance_loss_mlp": 1.03547215, + "epoch": 0.3494363445062378, + "flos": 27128538842400.0, + "grad_norm": 4.1612139038986555, + "language_loss": 0.80291075, + "learning_rate": 3.021731151138386e-06, + "loss": 0.8306756, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 5.971084117889404 + }, + { + "auxiliary_loss_clip": 0.01496108, + "auxiliary_loss_mlp": 0.01285274, + "balance_loss_clip": 1.16947591, + "balance_loss_mlp": 1.04361272, + "epoch": 0.34949646775890575, + "flos": 12277847085120.0, + "grad_norm": 2.2937716160695754, + "language_loss": 0.69640249, + "learning_rate": 3.021396326901918e-06, + "loss": 0.72421628, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 4.412616491317749 + }, + { + "auxiliary_loss_clip": 0.01501378, + "auxiliary_loss_mlp": 0.01287675, + "balance_loss_clip": 1.17508984, + "balance_loss_mlp": 1.04677701, + "epoch": 0.3495565910115737, + "flos": 17167442578560.0, + "grad_norm": 2.1253918333562987, + "language_loss": 0.76809371, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.79598415, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 2.7503840923309326 + }, + { + "auxiliary_loss_clip": 0.01492614, + "auxiliary_loss_mlp": 0.01283758, + "balance_loss_clip": 1.16705751, + "balance_loss_mlp": 1.03999913, + "epoch": 0.3496167142642417, + "flos": 26467899593760.0, + "grad_norm": 1.6664829655875466, + "language_loss": 0.84502304, + "learning_rate": 3.020726562247328e-06, + "loss": 0.87278676, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.8729329109191895 + }, + { + "auxiliary_loss_clip": 0.01490976, + "auxiliary_loss_mlp": 0.01276955, + "balance_loss_clip": 1.16687512, + "balance_loss_mlp": 1.03643799, + "epoch": 0.34967683751690964, + "flos": 17416443499200.0, + "grad_norm": 3.135787415101193, + "language_loss": 0.77547967, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.803159, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.811204433441162 + }, + { + "auxiliary_loss_clip": 0.01502392, + "auxiliary_loss_mlp": 0.01282588, + "balance_loss_clip": 1.17590547, + "balance_loss_mlp": 1.04035461, + "epoch": 0.3497369607695776, + "flos": 22602147052320.0, + "grad_norm": 1.977080719643501, + "language_loss": 0.58838904, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.61623883, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.8938326835632324 + }, + { + "auxiliary_loss_clip": 0.01607631, + "auxiliary_loss_mlp": 0.01251404, + "balance_loss_clip": 1.29458237, + "balance_loss_mlp": 1.03930664, + "epoch": 0.34979708402224563, + "flos": 68535769032000.0, + "grad_norm": 0.866868933898131, + "language_loss": 0.59906757, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.62765789, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.4358022212982178 + }, + { + "auxiliary_loss_clip": 0.01501555, + "auxiliary_loss_mlp": 0.01277718, + "balance_loss_clip": 1.17600024, + "balance_loss_mlp": 1.03586578, + "epoch": 0.3498572072749136, + "flos": 18991653577920.0, + "grad_norm": 1.7577726422214974, + "language_loss": 0.83634472, + "learning_rate": 3.019386568567123e-06, + "loss": 0.86413741, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 2.725022315979004 + }, + { + "auxiliary_loss_clip": 0.01499373, + "auxiliary_loss_mlp": 0.01278807, + "balance_loss_clip": 1.17211723, + "balance_loss_mlp": 1.03733659, + "epoch": 0.34991733052758156, + "flos": 27821379463200.0, + "grad_norm": 1.744740642272792, + "language_loss": 0.71388811, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.74166989, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.770850419998169 + }, + { + "auxiliary_loss_clip": 0.01500221, + "auxiliary_loss_mlp": 0.01287133, + "balance_loss_clip": 1.17351055, + "balance_loss_mlp": 1.04547155, + "epoch": 0.3499774537802495, + "flos": 33587048340000.0, + "grad_norm": 1.6448807321565806, + "language_loss": 0.70466888, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7325424, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.853496789932251 + }, + { + "auxiliary_loss_clip": 0.01500013, + "auxiliary_loss_mlp": 0.01317363, + "balance_loss_clip": 1.17310762, + "balance_loss_mlp": 1.07341325, + "epoch": 0.3500375770329175, + "flos": 23478978998880.0, + "grad_norm": 4.040684165671832, + "language_loss": 0.74068308, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.76885688, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.776305675506592 + }, + { + "auxiliary_loss_clip": 0.01511027, + "auxiliary_loss_mlp": 0.01272856, + "balance_loss_clip": 1.18406439, + "balance_loss_mlp": 1.02814293, + "epoch": 0.35009770028558546, + "flos": 19028064975840.0, + "grad_norm": 1.6348624168992285, + "language_loss": 0.78500646, + "learning_rate": 3.018045956403094e-06, + "loss": 0.81284535, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.755917549133301 + }, + { + "auxiliary_loss_clip": 0.01657232, + "auxiliary_loss_mlp": 0.01225433, + "balance_loss_clip": 1.34128737, + "balance_loss_mlp": 1.01104736, + "epoch": 0.3501578235382534, + "flos": 68358528918720.0, + "grad_norm": 0.7142006578494621, + "language_loss": 0.59098393, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61981064, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 3.3377580642700195 + }, + { + "auxiliary_loss_clip": 0.01505935, + "auxiliary_loss_mlp": 0.01270848, + "balance_loss_clip": 1.17896223, + "balance_loss_mlp": 1.03071249, + "epoch": 0.3502179467909214, + "flos": 21252990993120.0, + "grad_norm": 1.942854762116785, + "language_loss": 0.84810495, + "learning_rate": 3.017375418643811e-06, + "loss": 0.87587285, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.81308650970459 + }, + { + "auxiliary_loss_clip": 0.01521024, + "auxiliary_loss_mlp": 0.01295862, + "balance_loss_clip": 1.19424987, + "balance_loss_mlp": 1.05515456, + "epoch": 0.35027807004358935, + "flos": 11944303567200.0, + "grad_norm": 4.225157754122331, + "language_loss": 0.83233547, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.86050439, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.6911027431488037 + }, + { + "auxiliary_loss_clip": 0.01512237, + "auxiliary_loss_mlp": 0.01301384, + "balance_loss_clip": 1.18554735, + "balance_loss_mlp": 1.06105769, + "epoch": 0.3503381932962573, + "flos": 21473090291520.0, + "grad_norm": 1.6159317037281842, + "language_loss": 0.81018263, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83831882, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.7650716304779053 + }, + { + "auxiliary_loss_clip": 0.01521707, + "auxiliary_loss_mlp": 0.01289617, + "balance_loss_clip": 1.19609821, + "balance_loss_mlp": 1.04642987, + "epoch": 0.3503983165489253, + "flos": 21253218562080.0, + "grad_norm": 2.274220385250538, + "language_loss": 0.70855141, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73666465, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 2.786228656768799 + }, + { + "auxiliary_loss_clip": 0.01522977, + "auxiliary_loss_mlp": 0.01293155, + "balance_loss_clip": 1.19817448, + "balance_loss_mlp": 1.05054021, + "epoch": 0.35045843980159325, + "flos": 27818041785120.0, + "grad_norm": 1.8734482743983525, + "language_loss": 0.79522115, + "learning_rate": 3.016033880279248e-06, + "loss": 0.82338244, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 2.839310884475708 + }, + { + "auxiliary_loss_clip": 0.01523592, + "auxiliary_loss_mlp": 0.01284296, + "balance_loss_clip": 1.19783986, + "balance_loss_mlp": 1.03901076, + "epoch": 0.3505185630542612, + "flos": 25923511248480.0, + "grad_norm": 3.5974364246457364, + "language_loss": 0.72395551, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.75203443, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 2.849459171295166 + }, + { + "auxiliary_loss_clip": 0.01523422, + "auxiliary_loss_mlp": 0.01298503, + "balance_loss_clip": 1.19832015, + "balance_loss_mlp": 1.06027484, + "epoch": 0.35057868630692923, + "flos": 20523852758880.0, + "grad_norm": 2.298277120178467, + "language_loss": 0.88537216, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.91359144, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 2.8434033393859863 + }, + { + "auxiliary_loss_clip": 0.01520962, + "auxiliary_loss_mlp": 0.01293239, + "balance_loss_clip": 1.19542515, + "balance_loss_mlp": 1.0523411, + "epoch": 0.3506388095595972, + "flos": 20450726537760.0, + "grad_norm": 4.841609177208415, + "language_loss": 0.78631198, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.81445396, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 2.7989017963409424 + }, + { + "auxiliary_loss_clip": 0.01526085, + "auxiliary_loss_mlp": 0.01289494, + "balance_loss_clip": 1.20004964, + "balance_loss_mlp": 1.04821396, + "epoch": 0.35069893281226516, + "flos": 23111754910560.0, + "grad_norm": 2.1414819102261684, + "language_loss": 0.71456712, + "learning_rate": 3.014691725465008e-06, + "loss": 0.74272293, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 2.763704776763916 + }, + { + "auxiliary_loss_clip": 0.01527841, + "auxiliary_loss_mlp": 0.01288471, + "balance_loss_clip": 1.20289636, + "balance_loss_mlp": 1.0517689, + "epoch": 0.35075905606493313, + "flos": 27274487859360.0, + "grad_norm": 3.151426902282095, + "language_loss": 0.80897564, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83713871, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 2.833094358444214 + }, + { + "auxiliary_loss_clip": 0.01532607, + "auxiliary_loss_mlp": 0.01304773, + "balance_loss_clip": 1.2088083, + "balance_loss_mlp": 1.06788027, + "epoch": 0.3508191793176011, + "flos": 19130244531840.0, + "grad_norm": 2.8299422444530653, + "language_loss": 0.83855057, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86692435, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 2.717958450317383 + }, + { + "auxiliary_loss_clip": 0.01530418, + "auxiliary_loss_mlp": 0.01293487, + "balance_loss_clip": 1.20657074, + "balance_loss_mlp": 1.05564046, + "epoch": 0.35087930257026906, + "flos": 25560004119840.0, + "grad_norm": 1.5078624736756712, + "language_loss": 0.76541615, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.79365516, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.833934783935547 + }, + { + "auxiliary_loss_clip": 0.01538744, + "auxiliary_loss_mlp": 0.01297862, + "balance_loss_clip": 1.21435952, + "balance_loss_mlp": 1.05791783, + "epoch": 0.350939425822937, + "flos": 18006194288160.0, + "grad_norm": 2.356754668536605, + "language_loss": 0.77293915, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.80130517, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 2.736382246017456 + }, + { + "auxiliary_loss_clip": 0.01536754, + "auxiliary_loss_mlp": 0.01277175, + "balance_loss_clip": 1.21168518, + "balance_loss_mlp": 1.03704, + "epoch": 0.350999549075605, + "flos": 22275392675040.0, + "grad_norm": 1.7034638295927567, + "language_loss": 0.67870355, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70684284, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.7973434925079346 + }, + { + "auxiliary_loss_clip": 0.01534131, + "auxiliary_loss_mlp": 0.01278191, + "balance_loss_clip": 1.20979714, + "balance_loss_mlp": 1.03710175, + "epoch": 0.35105967232827295, + "flos": 14394031974720.0, + "grad_norm": 2.041549623507254, + "language_loss": 0.83557111, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.86369431, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 4.4637532234191895 + }, + { + "auxiliary_loss_clip": 0.01533263, + "auxiliary_loss_mlp": 0.01286885, + "balance_loss_clip": 1.20741749, + "balance_loss_mlp": 1.04293525, + "epoch": 0.3511197955809409, + "flos": 25084683682560.0, + "grad_norm": 1.679269564683537, + "language_loss": 0.58859348, + "learning_rate": 3.012341473657572e-06, + "loss": 0.61679494, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.8490004539489746 + }, + { + "auxiliary_loss_clip": 0.01538674, + "auxiliary_loss_mlp": 0.01281287, + "balance_loss_clip": 1.21389329, + "balance_loss_mlp": 1.03771853, + "epoch": 0.3511799188336089, + "flos": 25886493000000.0, + "grad_norm": 2.9589875110147332, + "language_loss": 0.86632979, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89452934, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 2.863569498062134 + }, + { + "auxiliary_loss_clip": 0.0153507, + "auxiliary_loss_mlp": 0.01278016, + "balance_loss_clip": 1.21017599, + "balance_loss_mlp": 1.02853513, + "epoch": 0.35124004208627685, + "flos": 20085778139040.0, + "grad_norm": 2.030789946122563, + "language_loss": 0.75554597, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.78367686, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.7951977252960205 + }, + { + "auxiliary_loss_clip": 0.01541333, + "auxiliary_loss_mlp": 0.01288264, + "balance_loss_clip": 1.21662509, + "balance_loss_mlp": 1.04069066, + "epoch": 0.3513001653389448, + "flos": 17785260570240.0, + "grad_norm": 2.739063689066819, + "language_loss": 0.68591809, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.71421409, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.7240071296691895 + }, + { + "auxiliary_loss_clip": 0.01532995, + "auxiliary_loss_mlp": 0.01295545, + "balance_loss_clip": 1.20877898, + "balance_loss_mlp": 1.05273974, + "epoch": 0.3513602885916128, + "flos": 29389686616800.0, + "grad_norm": 1.8994889953180771, + "language_loss": 0.65295386, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68123925, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 2.8801615238189697 + }, + { + "auxiliary_loss_clip": 0.01540735, + "auxiliary_loss_mlp": 0.01285749, + "balance_loss_clip": 1.21719933, + "balance_loss_mlp": 1.03893781, + "epoch": 0.3514204118442808, + "flos": 16181717791680.0, + "grad_norm": 2.3323138736728226, + "language_loss": 0.75712675, + "learning_rate": 3.010661570469245e-06, + "loss": 0.78539157, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.780409097671509 + }, + { + "auxiliary_loss_clip": 0.01550205, + "auxiliary_loss_mlp": 0.01279338, + "balance_loss_clip": 1.2250514, + "balance_loss_mlp": 1.03462565, + "epoch": 0.35148053509694877, + "flos": 23836531406400.0, + "grad_norm": 3.477154675582666, + "language_loss": 0.72492898, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75322437, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.87054705619812 + }, + { + "auxiliary_loss_clip": 0.01537884, + "auxiliary_loss_mlp": 0.01277176, + "balance_loss_clip": 1.21339202, + "balance_loss_mlp": 1.03513336, + "epoch": 0.35154065834961673, + "flos": 20993142618720.0, + "grad_norm": 1.8906918919409728, + "language_loss": 0.75671601, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78486669, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.851376533508301 + }, + { + "auxiliary_loss_clip": 0.01535394, + "auxiliary_loss_mlp": 0.01266315, + "balance_loss_clip": 1.21088779, + "balance_loss_mlp": 1.01969457, + "epoch": 0.3516007816022847, + "flos": 33258283770240.0, + "grad_norm": 6.6377470268133125, + "language_loss": 0.72491676, + "learning_rate": 3.009653168561666e-06, + "loss": 0.75293386, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 2.855698823928833 + }, + { + "auxiliary_loss_clip": 0.01547359, + "auxiliary_loss_mlp": 0.01285475, + "balance_loss_clip": 1.2214824, + "balance_loss_mlp": 1.03828287, + "epoch": 0.35166090485495266, + "flos": 11728452222720.0, + "grad_norm": 2.085989572635662, + "language_loss": 0.89836681, + "learning_rate": 3.009316958003178e-06, + "loss": 0.92669523, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.8800904750823975 + }, + { + "auxiliary_loss_clip": 0.01545044, + "auxiliary_loss_mlp": 0.01274073, + "balance_loss_clip": 1.21886683, + "balance_loss_mlp": 1.03126752, + "epoch": 0.3517210281076206, + "flos": 22640682427200.0, + "grad_norm": 2.919209890551548, + "language_loss": 0.74952829, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.7777195, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 4.351349830627441 + }, + { + "auxiliary_loss_clip": 0.01547199, + "auxiliary_loss_mlp": 0.0127072, + "balance_loss_clip": 1.22308397, + "balance_loss_mlp": 1.02848709, + "epoch": 0.3517811513602886, + "flos": 21324789728640.0, + "grad_norm": 1.4299442778000129, + "language_loss": 0.7599721, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.78815126, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 6.6359641551971436 + }, + { + "auxiliary_loss_clip": 0.01541137, + "auxiliary_loss_mlp": 0.01304793, + "balance_loss_clip": 1.216236, + "balance_loss_mlp": 1.05950809, + "epoch": 0.35184127461295656, + "flos": 21035015671680.0, + "grad_norm": 1.8627632227715538, + "language_loss": 0.87501597, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.90347528, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 2.768109083175659 + }, + { + "auxiliary_loss_clip": 0.0154258, + "auxiliary_loss_mlp": 0.01283477, + "balance_loss_clip": 1.21784019, + "balance_loss_mlp": 1.04086185, + "epoch": 0.3519013978656245, + "flos": 22457449664640.0, + "grad_norm": 2.284121906729656, + "language_loss": 0.68206739, + "learning_rate": 3.007971733162737e-06, + "loss": 0.71032792, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.7854416370391846 + }, + { + "auxiliary_loss_clip": 0.01540337, + "auxiliary_loss_mlp": 0.01270763, + "balance_loss_clip": 1.21360064, + "balance_loss_mlp": 1.02662241, + "epoch": 0.3519615211182925, + "flos": 13116978076320.0, + "grad_norm": 4.0535153858620045, + "language_loss": 0.81472021, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.84283125, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 2.7132680416107178 + }, + { + "auxiliary_loss_clip": 0.01547958, + "auxiliary_loss_mlp": 0.01270329, + "balance_loss_clip": 1.2221632, + "balance_loss_mlp": 1.02752352, + "epoch": 0.35202164437096045, + "flos": 19137147456960.0, + "grad_norm": 1.4539714459937696, + "language_loss": 0.73245704, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.76063991, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.8333802223205566 + }, + { + "auxiliary_loss_clip": 0.01539631, + "auxiliary_loss_mlp": 0.0128042, + "balance_loss_clip": 1.21346533, + "balance_loss_mlp": 1.03971219, + "epoch": 0.3520817676236284, + "flos": 26544628990080.0, + "grad_norm": 2.5592053472497396, + "language_loss": 0.71519142, + "learning_rate": 3.006962413152691e-06, + "loss": 0.74339199, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.805694580078125 + }, + { + "auxiliary_loss_clip": 0.01539864, + "auxiliary_loss_mlp": 0.0128449, + "balance_loss_clip": 1.21455407, + "balance_loss_mlp": 1.04111218, + "epoch": 0.3521418908762964, + "flos": 44896883453280.0, + "grad_norm": 2.2931009215559213, + "language_loss": 0.61172909, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63997263, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.9744832515716553 + }, + { + "auxiliary_loss_clip": 0.01536379, + "auxiliary_loss_mlp": 0.01273649, + "balance_loss_clip": 1.21041775, + "balance_loss_mlp": 1.02950823, + "epoch": 0.3522020141289644, + "flos": 20189133468000.0, + "grad_norm": 2.1738261076820358, + "language_loss": 0.73647535, + "learning_rate": 3.006289342204152e-06, + "loss": 0.76457566, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 2.7957191467285156 + }, + { + "auxiliary_loss_clip": 0.01541086, + "auxiliary_loss_mlp": 0.01289888, + "balance_loss_clip": 1.21463561, + "balance_loss_mlp": 1.04689169, + "epoch": 0.35226213738163237, + "flos": 27566423821440.0, + "grad_norm": 1.6688655262884384, + "language_loss": 0.7612015, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.7895112, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.7667760848999023 + }, + { + "auxiliary_loss_clip": 0.01526304, + "auxiliary_loss_mlp": 0.01289768, + "balance_loss_clip": 1.20051551, + "balance_loss_mlp": 1.04085886, + "epoch": 0.35232226063430033, + "flos": 22968536721120.0, + "grad_norm": 1.9332715282649073, + "language_loss": 0.72424638, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.75240707, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.8255839347839355 + }, + { + "auxiliary_loss_clip": 0.01523338, + "auxiliary_loss_mlp": 0.01290739, + "balance_loss_clip": 1.19787049, + "balance_loss_mlp": 1.04526293, + "epoch": 0.3523823838869683, + "flos": 19170107392320.0, + "grad_norm": 2.7987731354800647, + "language_loss": 0.66386312, + "learning_rate": 3.005279449623811e-06, + "loss": 0.69200391, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 2.7362139225006104 + }, + { + "auxiliary_loss_clip": 0.01547032, + "auxiliary_loss_mlp": 0.01282188, + "balance_loss_clip": 1.21953821, + "balance_loss_mlp": 1.04186177, + "epoch": 0.35244250713963626, + "flos": 17932916354400.0, + "grad_norm": 1.9973270034824042, + "language_loss": 0.66451883, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.69281101, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.809039354324341 + }, + { + "auxiliary_loss_clip": 0.01537578, + "auxiliary_loss_mlp": 0.01276904, + "balance_loss_clip": 1.21307886, + "balance_loss_mlp": 1.03295445, + "epoch": 0.35250263039230423, + "flos": 21434592844800.0, + "grad_norm": 1.8881968658384347, + "language_loss": 0.76992321, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79806805, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.7917640209198 + }, + { + "auxiliary_loss_clip": 0.01543337, + "auxiliary_loss_mlp": 0.01277787, + "balance_loss_clip": 1.21766841, + "balance_loss_mlp": 1.03440905, + "epoch": 0.3525627536449722, + "flos": 27419299031520.0, + "grad_norm": 1.7167629530462423, + "language_loss": 0.74894154, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77715272, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 2.8229148387908936 + }, + { + "auxiliary_loss_clip": 0.01542663, + "auxiliary_loss_mlp": 0.01277344, + "balance_loss_clip": 1.21712565, + "balance_loss_mlp": 1.03244019, + "epoch": 0.35262287689764016, + "flos": 24792102941760.0, + "grad_norm": 2.8332862009146367, + "language_loss": 0.79025364, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81845373, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.8547067642211914 + }, + { + "auxiliary_loss_clip": 0.01557443, + "auxiliary_loss_mlp": 0.01284692, + "balance_loss_clip": 1.23162115, + "balance_loss_mlp": 1.03807151, + "epoch": 0.3526830001503081, + "flos": 17823530448000.0, + "grad_norm": 3.215426656108301, + "language_loss": 0.81752527, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.84594667, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 2.7706148624420166 + }, + { + "auxiliary_loss_clip": 0.01541914, + "auxiliary_loss_mlp": 0.01289983, + "balance_loss_clip": 1.2173202, + "balance_loss_mlp": 1.04031146, + "epoch": 0.3527431234029761, + "flos": 18080117000640.0, + "grad_norm": 2.825685646834538, + "language_loss": 0.8484692, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.8767882, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 2.755603551864624 + }, + { + "auxiliary_loss_clip": 0.01546363, + "auxiliary_loss_mlp": 0.01283062, + "balance_loss_clip": 1.22272718, + "balance_loss_mlp": 1.03930247, + "epoch": 0.35280324665564405, + "flos": 19429386844320.0, + "grad_norm": 3.8703866571640777, + "language_loss": 0.74307936, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.77137363, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 2.8020546436309814 + }, + { + "auxiliary_loss_clip": 0.0153872, + "auxiliary_loss_mlp": 0.01276872, + "balance_loss_clip": 1.2153008, + "balance_loss_mlp": 1.02910686, + "epoch": 0.352863369908312, + "flos": 21506012298720.0, + "grad_norm": 1.7691527571159105, + "language_loss": 0.61825693, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.64641285, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 2.8155486583709717 + }, + { + "auxiliary_loss_clip": 0.01533118, + "auxiliary_loss_mlp": 0.01278727, + "balance_loss_clip": 1.21104169, + "balance_loss_mlp": 1.03401375, + "epoch": 0.35292349316098, + "flos": 22311690288480.0, + "grad_norm": 1.884143017637707, + "language_loss": 0.74292934, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.77104783, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 2.78653621673584 + }, + { + "auxiliary_loss_clip": 0.01536643, + "auxiliary_loss_mlp": 0.01284096, + "balance_loss_clip": 1.21436429, + "balance_loss_mlp": 1.03995562, + "epoch": 0.352983616413648, + "flos": 33112562322240.0, + "grad_norm": 1.4846100728385776, + "language_loss": 0.71767664, + "learning_rate": 3.001910665140316e-06, + "loss": 0.745884, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 2.9149985313415527 + }, + { + "auxiliary_loss_clip": 0.01545572, + "auxiliary_loss_mlp": 0.01274919, + "balance_loss_clip": 1.22186565, + "balance_loss_mlp": 1.0342114, + "epoch": 0.35304373966631597, + "flos": 18698541842880.0, + "grad_norm": 3.32688411208684, + "language_loss": 0.73691666, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76512164, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 2.7935197353363037 + }, + { + "auxiliary_loss_clip": 0.01532933, + "auxiliary_loss_mlp": 0.01280937, + "balance_loss_clip": 1.21121871, + "balance_loss_mlp": 1.03679633, + "epoch": 0.35310386291898394, + "flos": 23367051905760.0, + "grad_norm": 2.167530008712029, + "language_loss": 0.82561564, + "learning_rate": 3.001236451924089e-06, + "loss": 0.8537544, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 2.8082776069641113 + }, + { + "auxiliary_loss_clip": 0.01527332, + "auxiliary_loss_mlp": 0.01287992, + "balance_loss_clip": 1.20621085, + "balance_loss_mlp": 1.03908324, + "epoch": 0.3531639861716519, + "flos": 24464248647840.0, + "grad_norm": 3.586392722279587, + "language_loss": 0.66102159, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68917483, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.9495646953582764 + }, + { + "auxiliary_loss_clip": 0.01750611, + "auxiliary_loss_mlp": 0.01249283, + "balance_loss_clip": 1.42793393, + "balance_loss_mlp": 1.03947449, + "epoch": 0.35322410942431987, + "flos": 70318675900800.0, + "grad_norm": 0.769401367008033, + "language_loss": 0.6134094, + "learning_rate": 3.000562086839917e-06, + "loss": 0.6434083, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 3.257398843765259 + }, + { + "auxiliary_loss_clip": 0.01536266, + "auxiliary_loss_mlp": 0.01280118, + "balance_loss_clip": 1.21314752, + "balance_loss_mlp": 1.03712153, + "epoch": 0.35328423267698783, + "flos": 19822516230240.0, + "grad_norm": 1.8529480533474036, + "language_loss": 0.79658759, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82475144, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.8016602993011475 + }, + { + "auxiliary_loss_clip": 0.01740271, + "auxiliary_loss_mlp": 0.01237267, + "balance_loss_clip": 1.41824031, + "balance_loss_mlp": 1.02516937, + "epoch": 0.3533443559296558, + "flos": 60832065654720.0, + "grad_norm": 0.6785612999352599, + "language_loss": 0.56695068, + "learning_rate": 2.999887569990088e-06, + "loss": 0.596726, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 4.964188575744629 + }, + { + "auxiliary_loss_clip": 0.0153005, + "auxiliary_loss_mlp": 0.01292963, + "balance_loss_clip": 1.20843482, + "balance_loss_mlp": 1.05111122, + "epoch": 0.35340447918232376, + "flos": 24758194802400.0, + "grad_norm": 1.6057508705556598, + "language_loss": 0.71907914, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74730927, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 2.9208295345306396 + }, + { + "auxiliary_loss_clip": 0.01537662, + "auxiliary_loss_mlp": 0.01277412, + "balance_loss_clip": 1.21591294, + "balance_loss_mlp": 1.03479767, + "epoch": 0.3534646024349917, + "flos": 21798327542400.0, + "grad_norm": 2.099170177401018, + "language_loss": 0.78866637, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.81681705, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.7954654693603516 + }, + { + "auxiliary_loss_clip": 0.01526032, + "auxiliary_loss_mlp": 0.01287104, + "balance_loss_clip": 1.20657396, + "balance_loss_mlp": 1.04181862, + "epoch": 0.3535247256876597, + "flos": 20014244900640.0, + "grad_norm": 2.3347555786567424, + "language_loss": 0.63538373, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.66351509, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 2.890108108520508 + }, + { + "auxiliary_loss_clip": 0.01537255, + "auxiliary_loss_mlp": 0.01287542, + "balance_loss_clip": 1.21608722, + "balance_loss_mlp": 1.04492724, + "epoch": 0.35358484894032766, + "flos": 18189958044960.0, + "grad_norm": 2.422122663560406, + "language_loss": 0.65808195, + "learning_rate": 2.998538081402727e-06, + "loss": 0.68632996, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.7664122581481934 + }, + { + "auxiliary_loss_clip": 0.01525777, + "auxiliary_loss_mlp": 0.01264162, + "balance_loss_clip": 1.20509386, + "balance_loss_mlp": 1.02250147, + "epoch": 0.3536449721929956, + "flos": 22822587704160.0, + "grad_norm": 1.5870155091909612, + "language_loss": 0.75445431, + "learning_rate": 2.998200614562239e-06, + "loss": 0.78235376, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.822378635406494 + }, + { + "auxiliary_loss_clip": 0.01535338, + "auxiliary_loss_mlp": 0.01278497, + "balance_loss_clip": 1.21358931, + "balance_loss_mlp": 1.02939677, + "epoch": 0.3537050954456636, + "flos": 26434750017600.0, + "grad_norm": 3.0994994488522387, + "language_loss": 0.70918679, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.73732507, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.7757961750030518 + }, + { + "auxiliary_loss_clip": 0.01531698, + "auxiliary_loss_mlp": 0.01276837, + "balance_loss_clip": 1.20888972, + "balance_loss_mlp": 1.03097987, + "epoch": 0.3537652186983316, + "flos": 17198809531200.0, + "grad_norm": 1.9691404243440727, + "language_loss": 0.77973735, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.8078227, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.8011410236358643 + }, + { + "auxiliary_loss_clip": 0.01529626, + "auxiliary_loss_mlp": 0.01283381, + "balance_loss_clip": 1.20878816, + "balance_loss_mlp": 1.03771484, + "epoch": 0.3538253419509996, + "flos": 19538810678880.0, + "grad_norm": 2.382996554774657, + "language_loss": 0.75212282, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.78025293, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.820697784423828 + }, + { + "auxiliary_loss_clip": 0.01523602, + "auxiliary_loss_mlp": 0.01276675, + "balance_loss_clip": 1.20299745, + "balance_loss_mlp": 1.02986407, + "epoch": 0.35388546520366754, + "flos": 12130115444640.0, + "grad_norm": 3.1882706595375523, + "language_loss": 0.83833039, + "learning_rate": 2.996850368809606e-06, + "loss": 0.86633313, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.7156336307525635 + }, + { + "auxiliary_loss_clip": 0.01532354, + "auxiliary_loss_mlp": 0.01275127, + "balance_loss_clip": 1.21177983, + "balance_loss_mlp": 1.03480077, + "epoch": 0.3539455884563355, + "flos": 19679942819520.0, + "grad_norm": 2.0472428669607625, + "language_loss": 0.78240514, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.81048, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 2.7923295497894287 + }, + { + "auxiliary_loss_clip": 0.01532425, + "auxiliary_loss_mlp": 0.01277596, + "balance_loss_clip": 1.21148694, + "balance_loss_mlp": 1.03307354, + "epoch": 0.35400571170900347, + "flos": 18073707141600.0, + "grad_norm": 2.5956353772136755, + "language_loss": 0.65367222, + "learning_rate": 2.996175019078089e-06, + "loss": 0.68177235, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 4.299969434738159 + }, + { + "auxiliary_loss_clip": 0.01534894, + "auxiliary_loss_mlp": 0.01284864, + "balance_loss_clip": 1.21430206, + "balance_loss_mlp": 1.03996086, + "epoch": 0.35406583496167143, + "flos": 26070484325760.0, + "grad_norm": 1.8801039048983397, + "language_loss": 0.77373278, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.80193037, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 4.552464008331299 + }, + { + "auxiliary_loss_clip": 0.01539087, + "auxiliary_loss_mlp": 0.01288585, + "balance_loss_clip": 1.21907222, + "balance_loss_mlp": 1.04558873, + "epoch": 0.3541259582143394, + "flos": 19794524883840.0, + "grad_norm": 2.1156799101562194, + "language_loss": 0.80613136, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.83440804, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 4.200288534164429 + }, + { + "auxiliary_loss_clip": 0.01517543, + "auxiliary_loss_mlp": 0.01277156, + "balance_loss_clip": 1.19715953, + "balance_loss_mlp": 1.03740239, + "epoch": 0.35418608146700736, + "flos": 24024239691840.0, + "grad_norm": 1.8049681020179684, + "language_loss": 0.79839349, + "learning_rate": 2.99516171119991e-06, + "loss": 0.8263405, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 2.78436279296875 + }, + { + "auxiliary_loss_clip": 0.01537355, + "auxiliary_loss_mlp": 0.01297461, + "balance_loss_clip": 1.21681738, + "balance_loss_mlp": 1.05847049, + "epoch": 0.35424620471967533, + "flos": 12387422632320.0, + "grad_norm": 2.1559337601608064, + "language_loss": 0.73084158, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75918978, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.7563717365264893 + }, + { + "auxiliary_loss_clip": 0.01528073, + "auxiliary_loss_mlp": 0.01277348, + "balance_loss_clip": 1.20699644, + "balance_loss_mlp": 1.03664017, + "epoch": 0.3543063279723433, + "flos": 19675163871360.0, + "grad_norm": 2.712830583128515, + "language_loss": 0.67233866, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.70039284, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 2.7734804153442383 + }, + { + "auxiliary_loss_clip": 0.01525597, + "auxiliary_loss_mlp": 0.01299624, + "balance_loss_clip": 1.2049644, + "balance_loss_mlp": 1.06063342, + "epoch": 0.35436645122501126, + "flos": 21911430408480.0, + "grad_norm": 1.8685604279030692, + "language_loss": 0.69582468, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.72407687, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.756699800491333 + }, + { + "auxiliary_loss_clip": 0.01527192, + "auxiliary_loss_mlp": 0.01283023, + "balance_loss_clip": 1.20737696, + "balance_loss_mlp": 1.04441333, + "epoch": 0.3544265744776792, + "flos": 21721332648960.0, + "grad_norm": 1.6840054119223302, + "language_loss": 0.74875623, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77685833, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.783745527267456 + }, + { + "auxiliary_loss_clip": 0.01526955, + "auxiliary_loss_mlp": 0.01290129, + "balance_loss_clip": 1.20649695, + "balance_loss_mlp": 1.04846835, + "epoch": 0.3544866977303472, + "flos": 21215252109600.0, + "grad_norm": 2.175709406247206, + "language_loss": 0.83628225, + "learning_rate": 2.993472110174491e-06, + "loss": 0.86445308, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.860642910003662 + }, + { + "auxiliary_loss_clip": 0.01535608, + "auxiliary_loss_mlp": 0.01293334, + "balance_loss_clip": 1.21539545, + "balance_loss_mlp": 1.05033767, + "epoch": 0.35454682098301515, + "flos": 29313791640000.0, + "grad_norm": 1.8121469508390189, + "language_loss": 0.70385206, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.73214149, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.826815128326416 + }, + { + "auxiliary_loss_clip": 0.01525424, + "auxiliary_loss_mlp": 0.01271344, + "balance_loss_clip": 1.20421326, + "balance_loss_mlp": 1.03063703, + "epoch": 0.3546069442356832, + "flos": 24318906481440.0, + "grad_norm": 1.8071670869202954, + "language_loss": 0.81503451, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84300214, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 2.850247383117676 + }, + { + "auxiliary_loss_clip": 0.01520182, + "auxiliary_loss_mlp": 0.0127931, + "balance_loss_clip": 1.19835758, + "balance_loss_mlp": 1.04146338, + "epoch": 0.35466706748835114, + "flos": 22859530096320.0, + "grad_norm": 1.7683136569742854, + "language_loss": 0.74539739, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.77339232, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 2.8057498931884766 + }, + { + "auxiliary_loss_clip": 0.01519207, + "auxiliary_loss_mlp": 0.01284829, + "balance_loss_clip": 1.19788516, + "balance_loss_mlp": 1.04068828, + "epoch": 0.3547271907410191, + "flos": 28332656160480.0, + "grad_norm": 2.0637334444591375, + "language_loss": 0.79580289, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.82384324, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 2.83474063873291 + }, + { + "auxiliary_loss_clip": 0.0152642, + "auxiliary_loss_mlp": 0.01280334, + "balance_loss_clip": 1.20430064, + "balance_loss_mlp": 1.03695607, + "epoch": 0.35478731399368707, + "flos": 23516376528960.0, + "grad_norm": 1.8902070671036384, + "language_loss": 0.81934142, + "learning_rate": 2.991781567335093e-06, + "loss": 0.84740895, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.8044559955596924 + }, + { + "auxiliary_loss_clip": 0.01521296, + "auxiliary_loss_mlp": 0.012832, + "balance_loss_clip": 1.19916534, + "balance_loss_mlp": 1.0390594, + "epoch": 0.35484743724635504, + "flos": 18626287969440.0, + "grad_norm": 5.37120231904748, + "language_loss": 0.75979018, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.78783512, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.7750768661499023 + }, + { + "auxiliary_loss_clip": 0.01524326, + "auxiliary_loss_mlp": 0.01279303, + "balance_loss_clip": 1.20279956, + "balance_loss_mlp": 1.03535318, + "epoch": 0.354907560499023, + "flos": 17386441960320.0, + "grad_norm": 2.628677429892899, + "language_loss": 0.70357919, + "learning_rate": 2.991105086850381e-06, + "loss": 0.73161548, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 2.780092239379883 + }, + { + "auxiliary_loss_clip": 0.01517007, + "auxiliary_loss_mlp": 0.0128516, + "balance_loss_clip": 1.19376123, + "balance_loss_mlp": 1.04044688, + "epoch": 0.35496768375169097, + "flos": 19210577103360.0, + "grad_norm": 2.714708347752337, + "language_loss": 0.74632704, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.77434874, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 2.7542426586151123 + }, + { + "auxiliary_loss_clip": 0.01514285, + "auxiliary_loss_mlp": 0.01279668, + "balance_loss_clip": 1.19123244, + "balance_loss_mlp": 1.03571773, + "epoch": 0.35502780700435893, + "flos": 18334769217120.0, + "grad_norm": 3.492949482826883, + "language_loss": 0.78846788, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81640738, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 2.75201153755188 + }, + { + "auxiliary_loss_clip": 0.01530789, + "auxiliary_loss_mlp": 0.01273685, + "balance_loss_clip": 1.20691955, + "balance_loss_mlp": 1.0394628, + "epoch": 0.3550879302570269, + "flos": 15450265939680.0, + "grad_norm": 2.222984560128403, + "language_loss": 0.72302568, + "learning_rate": 2.990090084284356e-06, + "loss": 0.75107038, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.849790334701538 + }, + { + "auxiliary_loss_clip": 0.01526522, + "auxiliary_loss_mlp": 0.01289529, + "balance_loss_clip": 1.20115077, + "balance_loss_mlp": 1.04805839, + "epoch": 0.35514805350969486, + "flos": 21981256879680.0, + "grad_norm": 3.3659310406553993, + "language_loss": 0.74942148, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.77758199, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.7575764656066895 + }, + { + "auxiliary_loss_clip": 0.01532309, + "auxiliary_loss_mlp": 0.0127113, + "balance_loss_clip": 1.20887232, + "balance_loss_mlp": 1.02832496, + "epoch": 0.3552081767623628, + "flos": 29864551916160.0, + "grad_norm": 1.960087650254947, + "language_loss": 0.75717115, + "learning_rate": 2.989413228164047e-06, + "loss": 0.7852056, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 2.9296011924743652 + }, + { + "auxiliary_loss_clip": 0.01526858, + "auxiliary_loss_mlp": 0.01284376, + "balance_loss_clip": 1.20475042, + "balance_loss_mlp": 1.04405022, + "epoch": 0.3552683000150308, + "flos": 26434674161280.0, + "grad_norm": 2.412893363579766, + "language_loss": 0.68177319, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70988554, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 2.818305253982544 + }, + { + "auxiliary_loss_clip": 0.01530732, + "auxiliary_loss_mlp": 0.01279691, + "balance_loss_clip": 1.21014369, + "balance_loss_mlp": 1.03860247, + "epoch": 0.35532842326769876, + "flos": 19787697815040.0, + "grad_norm": 1.8813444203841478, + "language_loss": 0.78268719, + "learning_rate": 2.988736221969144e-06, + "loss": 0.81079143, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 2.8461687564849854 + }, + { + "auxiliary_loss_clip": 0.01519953, + "auxiliary_loss_mlp": 0.01284551, + "balance_loss_clip": 1.19957483, + "balance_loss_mlp": 1.04327166, + "epoch": 0.3553885465203668, + "flos": 17241251506560.0, + "grad_norm": 1.9245862708136763, + "language_loss": 0.71395689, + "learning_rate": 2.98839766262581e-06, + "loss": 0.74200195, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 2.8110532760620117 + }, + { + "auxiliary_loss_clip": 0.0151716, + "auxiliary_loss_mlp": 0.01274333, + "balance_loss_clip": 1.19669056, + "balance_loss_mlp": 1.03629565, + "epoch": 0.35544866977303474, + "flos": 14935841205120.0, + "grad_norm": 2.0464085395145832, + "language_loss": 0.86805815, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89597303, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 2.852090358734131 + }, + { + "auxiliary_loss_clip": 0.01517037, + "auxiliary_loss_mlp": 0.01283477, + "balance_loss_clip": 1.19692707, + "balance_loss_mlp": 1.04295993, + "epoch": 0.3555087930257027, + "flos": 19758113485920.0, + "grad_norm": 1.8089885585976253, + "language_loss": 0.76765406, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79565924, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 2.7853333950042725 + }, + { + "auxiliary_loss_clip": 0.01515105, + "auxiliary_loss_mlp": 0.01274541, + "balance_loss_clip": 1.19522762, + "balance_loss_mlp": 1.03593183, + "epoch": 0.3555689162783707, + "flos": 21070327152960.0, + "grad_norm": 1.5425517225132628, + "language_loss": 0.82771146, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.85560787, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.8672800064086914 + }, + { + "auxiliary_loss_clip": 0.01507325, + "auxiliary_loss_mlp": 0.0127346, + "balance_loss_clip": 1.18811762, + "balance_loss_mlp": 1.02703059, + "epoch": 0.35562903953103864, + "flos": 33072244323840.0, + "grad_norm": 2.095565665995677, + "language_loss": 0.7073878, + "learning_rate": 2.98704305057949e-06, + "loss": 0.73519564, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.9044787883758545 + }, + { + "auxiliary_loss_clip": 0.01510921, + "auxiliary_loss_mlp": 0.01280644, + "balance_loss_clip": 1.18967509, + "balance_loss_mlp": 1.03974533, + "epoch": 0.3556891627837066, + "flos": 20559922803360.0, + "grad_norm": 1.8633698267643923, + "language_loss": 0.76260924, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.79052484, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 4.385798692703247 + }, + { + "auxiliary_loss_clip": 0.0151531, + "auxiliary_loss_mlp": 0.01281329, + "balance_loss_clip": 1.19478381, + "balance_loss_mlp": 1.04024053, + "epoch": 0.35574928603637457, + "flos": 20705378754240.0, + "grad_norm": 3.496019772776534, + "language_loss": 0.88562846, + "learning_rate": 2.986365519932332e-06, + "loss": 0.91359484, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 2.802263021469116 + }, + { + "auxiliary_loss_clip": 0.01510733, + "auxiliary_loss_mlp": 0.0127853, + "balance_loss_clip": 1.19079733, + "balance_loss_mlp": 1.03629684, + "epoch": 0.35580940928904253, + "flos": 15196372286400.0, + "grad_norm": 4.290757145617089, + "language_loss": 0.7506032, + "learning_rate": 2.98602669849771e-06, + "loss": 0.77849585, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.7932186126708984 + }, + { + "auxiliary_loss_clip": 0.01641156, + "auxiliary_loss_mlp": 0.01234261, + "balance_loss_clip": 1.3272419, + "balance_loss_mlp": 1.01377106, + "epoch": 0.3558695325417105, + "flos": 58644840592800.0, + "grad_norm": 0.9579151593821115, + "language_loss": 0.63832742, + "learning_rate": 2.985687839672857e-06, + "loss": 0.66708159, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 3.0619266033172607 + }, + { + "auxiliary_loss_clip": 0.01502509, + "auxiliary_loss_mlp": 0.01282799, + "balance_loss_clip": 1.18334675, + "balance_loss_mlp": 1.0439992, + "epoch": 0.35592965579437846, + "flos": 22020437033280.0, + "grad_norm": 2.2497864043112465, + "language_loss": 0.74400008, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.77185309, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 2.7897627353668213 + }, + { + "auxiliary_loss_clip": 0.01510589, + "auxiliary_loss_mlp": 0.01278297, + "balance_loss_clip": 1.19215941, + "balance_loss_mlp": 1.03549123, + "epoch": 0.35598977904704643, + "flos": 23369934445920.0, + "grad_norm": 1.9035451871475757, + "language_loss": 0.77397895, + "learning_rate": 2.985010009903857e-06, + "loss": 0.80186784, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.78261137008667 + }, + { + "auxiliary_loss_clip": 0.01510593, + "auxiliary_loss_mlp": 0.01281929, + "balance_loss_clip": 1.19185042, + "balance_loss_mlp": 1.0402683, + "epoch": 0.3560499022997144, + "flos": 17787270762720.0, + "grad_norm": 2.242547305545878, + "language_loss": 0.67709565, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.7050209, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.8062760829925537 + }, + { + "auxiliary_loss_clip": 0.01511131, + "auxiliary_loss_mlp": 0.01299369, + "balance_loss_clip": 1.19227159, + "balance_loss_mlp": 1.05961537, + "epoch": 0.35611002555238236, + "flos": 20742548715360.0, + "grad_norm": 2.059926693231884, + "language_loss": 0.79238987, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.82049483, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.754453659057617 + }, + { + "auxiliary_loss_clip": 0.01505668, + "auxiliary_loss_mlp": 0.0128105, + "balance_loss_clip": 1.18602538, + "balance_loss_mlp": 1.04205894, + "epoch": 0.3561701488050504, + "flos": 19464167331360.0, + "grad_norm": 1.9179777771472903, + "language_loss": 0.85466915, + "learning_rate": 2.983992985144908e-06, + "loss": 0.88253629, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.82297945022583 + }, + { + "auxiliary_loss_clip": 0.0151035, + "auxiliary_loss_mlp": 0.01293085, + "balance_loss_clip": 1.19135976, + "balance_loss_mlp": 1.05256844, + "epoch": 0.35623027205771834, + "flos": 30777643548000.0, + "grad_norm": 2.167683724026894, + "language_loss": 0.77824128, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.80627561, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.8454740047454834 + }, + { + "auxiliary_loss_clip": 0.01507402, + "auxiliary_loss_mlp": 0.01282537, + "balance_loss_clip": 1.18900239, + "balance_loss_mlp": 1.04526293, + "epoch": 0.3562903953103863, + "flos": 16982882330400.0, + "grad_norm": 1.9995867456939782, + "language_loss": 0.75805283, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78595221, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 4.155188083648682 + }, + { + "auxiliary_loss_clip": 0.0149567, + "auxiliary_loss_mlp": 0.01285245, + "balance_loss_clip": 1.1763885, + "balance_loss_mlp": 1.04568183, + "epoch": 0.3563505185630543, + "flos": 23841993061440.0, + "grad_norm": 2.0853429631805986, + "language_loss": 0.69825876, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.7260679, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 4.49361515045166 + }, + { + "auxiliary_loss_clip": 0.01510967, + "auxiliary_loss_mlp": 0.01287933, + "balance_loss_clip": 1.19126403, + "balance_loss_mlp": 1.05046809, + "epoch": 0.35641064181572224, + "flos": 22275658172160.0, + "grad_norm": 1.950355580422069, + "language_loss": 0.79908133, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.82707036, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 4.28633189201355 + }, + { + "auxiliary_loss_clip": 0.01513684, + "auxiliary_loss_mlp": 0.01295921, + "balance_loss_clip": 1.19463778, + "balance_loss_mlp": 1.05902863, + "epoch": 0.3564707650683902, + "flos": 23003393064480.0, + "grad_norm": 1.5453328583911778, + "language_loss": 0.8200053, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84810138, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 2.806593418121338 + }, + { + "auxiliary_loss_clip": 0.01497681, + "auxiliary_loss_mlp": 0.01280651, + "balance_loss_clip": 1.17856205, + "balance_loss_mlp": 1.04280472, + "epoch": 0.35653088832105817, + "flos": 14686043793120.0, + "grad_norm": 1.7263280749138148, + "language_loss": 0.70414114, + "learning_rate": 2.981957928520201e-06, + "loss": 0.73192453, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.819352149963379 + }, + { + "auxiliary_loss_clip": 0.01507383, + "auxiliary_loss_mlp": 0.01285178, + "balance_loss_clip": 1.18899822, + "balance_loss_mlp": 1.0423727, + "epoch": 0.35659101157372614, + "flos": 23479320352320.0, + "grad_norm": 1.906898191762173, + "language_loss": 0.67758179, + "learning_rate": 2.981618622015244e-06, + "loss": 0.7055074, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 2.8111050128936768 + }, + { + "auxiliary_loss_clip": 0.01503824, + "auxiliary_loss_mlp": 0.01278045, + "balance_loss_clip": 1.18527734, + "balance_loss_mlp": 1.03943563, + "epoch": 0.3566511348263941, + "flos": 26581116244320.0, + "grad_norm": 1.9276753582579325, + "language_loss": 0.68035507, + "learning_rate": 2.981279278287211e-06, + "loss": 0.70817375, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 2.783658504486084 + }, + { + "auxiliary_loss_clip": 0.01506868, + "auxiliary_loss_mlp": 0.012798, + "balance_loss_clip": 1.18854499, + "balance_loss_mlp": 1.03890228, + "epoch": 0.35671125807906207, + "flos": 13116674651040.0, + "grad_norm": 2.37032276381206, + "language_loss": 0.7876997, + "learning_rate": 2.980939897348969e-06, + "loss": 0.81556642, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.78800630569458 + }, + { + "auxiliary_loss_clip": 0.01508251, + "auxiliary_loss_mlp": 0.01284897, + "balance_loss_clip": 1.19030094, + "balance_loss_mlp": 1.04552507, + "epoch": 0.35677138133173003, + "flos": 33003403984800.0, + "grad_norm": 1.8275354226660379, + "language_loss": 0.69571406, + "learning_rate": 2.980600479213388e-06, + "loss": 0.72364557, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.8886773586273193 + }, + { + "auxiliary_loss_clip": 0.01501932, + "auxiliary_loss_mlp": 0.01277823, + "balance_loss_clip": 1.18340969, + "balance_loss_mlp": 1.03139305, + "epoch": 0.356831504584398, + "flos": 20779984173600.0, + "grad_norm": 2.810106297929356, + "language_loss": 0.70771062, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73550814, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 2.786414861679077 + }, + { + "auxiliary_loss_clip": 0.01508153, + "auxiliary_loss_mlp": 0.01275613, + "balance_loss_clip": 1.18999493, + "balance_loss_mlp": 1.03185344, + "epoch": 0.35689162783706596, + "flos": 12167133693120.0, + "grad_norm": 2.132740555754522, + "language_loss": 0.78503072, + "learning_rate": 2.979921531401692e-06, + "loss": 0.81286836, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 2.7316126823425293 + }, + { + "auxiliary_loss_clip": 0.01506303, + "auxiliary_loss_mlp": 0.01277492, + "balance_loss_clip": 1.18808496, + "balance_loss_mlp": 1.03564036, + "epoch": 0.356951751089734, + "flos": 23843737756800.0, + "grad_norm": 1.6566061935200533, + "language_loss": 0.64415872, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.67199665, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 2.827766180038452 + }, + { + "auxiliary_loss_clip": 0.01504073, + "auxiliary_loss_mlp": 0.01270816, + "balance_loss_clip": 1.18566275, + "balance_loss_mlp": 1.02839208, + "epoch": 0.35701187434240195, + "flos": 11723407777440.0, + "grad_norm": 3.148150923683224, + "language_loss": 0.785918, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.81366694, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 2.718259572982788 + }, + { + "auxiliary_loss_clip": 0.01513743, + "auxiliary_loss_mlp": 0.01271952, + "balance_loss_clip": 1.19567072, + "balance_loss_mlp": 1.02723932, + "epoch": 0.3570719975950699, + "flos": 24901147494720.0, + "grad_norm": 1.4933467694782614, + "language_loss": 0.80598456, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.8338415, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.791215658187866 + }, + { + "auxiliary_loss_clip": 0.01499075, + "auxiliary_loss_mlp": 0.01269462, + "balance_loss_clip": 1.18119717, + "balance_loss_mlp": 1.01902735, + "epoch": 0.3571321208477379, + "flos": 25997244320160.0, + "grad_norm": 2.26378797596038, + "language_loss": 0.79094577, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81863117, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.8330605030059814 + }, + { + "auxiliary_loss_clip": 0.01506206, + "auxiliary_loss_mlp": 0.01279474, + "balance_loss_clip": 1.18863559, + "balance_loss_mlp": 1.03705025, + "epoch": 0.35719224410040584, + "flos": 14503417881120.0, + "grad_norm": 2.0796019085585975, + "language_loss": 0.72682118, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.75467801, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.791980266571045 + }, + { + "auxiliary_loss_clip": 0.0150773, + "auxiliary_loss_mlp": 0.01277581, + "balance_loss_clip": 1.19088769, + "balance_loss_mlp": 1.0343945, + "epoch": 0.3572523673530738, + "flos": 31178130996960.0, + "grad_norm": 3.110026390650074, + "language_loss": 0.64335907, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.6712122, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 2.8373117446899414 + }, + { + "auxiliary_loss_clip": 0.01510281, + "auxiliary_loss_mlp": 0.01267832, + "balance_loss_clip": 1.19314063, + "balance_loss_mlp": 1.02330971, + "epoch": 0.3573124906057418, + "flos": 15853825569600.0, + "grad_norm": 5.501237562969923, + "language_loss": 0.73887837, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.7666595, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 2.728440523147583 + }, + { + "auxiliary_loss_clip": 0.01621279, + "auxiliary_loss_mlp": 0.01337311, + "balance_loss_clip": 1.31050897, + "balance_loss_mlp": 1.12902832, + "epoch": 0.35737261385840974, + "flos": 60828234910560.0, + "grad_norm": 0.8256009712301845, + "language_loss": 0.60619807, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.63578391, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.3888540267944336 + }, + { + "auxiliary_loss_clip": 0.01510147, + "auxiliary_loss_mlp": 0.01278472, + "balance_loss_clip": 1.19263959, + "balance_loss_mlp": 1.03356862, + "epoch": 0.3574327371110777, + "flos": 18845780417280.0, + "grad_norm": 4.143590797206329, + "language_loss": 0.7242887, + "learning_rate": 2.976864428379655e-06, + "loss": 0.75217497, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.726625680923462 + }, + { + "auxiliary_loss_clip": 0.01508754, + "auxiliary_loss_mlp": 0.01283559, + "balance_loss_clip": 1.19106436, + "balance_loss_mlp": 1.03712964, + "epoch": 0.35749286036374567, + "flos": 23551915579200.0, + "grad_norm": 1.8016925575967204, + "language_loss": 0.81283259, + "learning_rate": 2.976524564880326e-06, + "loss": 0.8407557, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 2.8029403686523438 + }, + { + "auxiliary_loss_clip": 0.01509646, + "auxiliary_loss_mlp": 0.01286465, + "balance_loss_clip": 1.19367373, + "balance_loss_mlp": 1.03908229, + "epoch": 0.35755298361641363, + "flos": 21107610898560.0, + "grad_norm": 1.7036943317112245, + "language_loss": 0.69185245, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71981359, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 2.8176116943359375 + }, + { + "auxiliary_loss_clip": 0.01513447, + "auxiliary_loss_mlp": 0.01274912, + "balance_loss_clip": 1.1960727, + "balance_loss_mlp": 1.03134298, + "epoch": 0.3576131068690816, + "flos": 19247064357600.0, + "grad_norm": 1.8092031572571017, + "language_loss": 0.75946927, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.7873528, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 2.754774808883667 + }, + { + "auxiliary_loss_clip": 0.01501988, + "auxiliary_loss_mlp": 0.01270742, + "balance_loss_clip": 1.18552041, + "balance_loss_mlp": 1.02698278, + "epoch": 0.35767323012174956, + "flos": 28657058991840.0, + "grad_norm": 2.008518791692257, + "language_loss": 0.70975232, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.73747957, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 2.8661110401153564 + }, + { + "auxiliary_loss_clip": 0.01505949, + "auxiliary_loss_mlp": 0.01280955, + "balance_loss_clip": 1.18792152, + "balance_loss_mlp": 1.03738642, + "epoch": 0.35773335337441753, + "flos": 17086882438080.0, + "grad_norm": 2.2645498707039655, + "language_loss": 0.77271295, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.80058205, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.7106196880340576 + }, + { + "auxiliary_loss_clip": 0.01503981, + "auxiliary_loss_mlp": 0.01268712, + "balance_loss_clip": 1.18557513, + "balance_loss_mlp": 1.02399909, + "epoch": 0.35779347662708555, + "flos": 15890464536480.0, + "grad_norm": 2.233349170071682, + "language_loss": 0.7296114, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.75733835, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.7743942737579346 + }, + { + "auxiliary_loss_clip": 0.01511146, + "auxiliary_loss_mlp": 0.01279283, + "balance_loss_clip": 1.19243813, + "balance_loss_mlp": 1.03189945, + "epoch": 0.3578535998797535, + "flos": 28661420730240.0, + "grad_norm": 2.0749214766257755, + "language_loss": 0.70049667, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.72840095, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 2.8598177433013916 + }, + { + "auxiliary_loss_clip": 0.01502679, + "auxiliary_loss_mlp": 0.01270604, + "balance_loss_clip": 1.18456125, + "balance_loss_mlp": 1.02837062, + "epoch": 0.3579137231324215, + "flos": 37855488163680.0, + "grad_norm": 2.0789316779490736, + "language_loss": 0.69847429, + "learning_rate": 2.974144484269449e-06, + "loss": 0.72620714, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.9041261672973633 + }, + { + "auxiliary_loss_clip": 0.01514309, + "auxiliary_loss_mlp": 0.01289449, + "balance_loss_clip": 1.19573283, + "balance_loss_mlp": 1.04740644, + "epoch": 0.35797384638508944, + "flos": 22349087818560.0, + "grad_norm": 1.88846474587716, + "language_loss": 0.6687746, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.69681227, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 4.40779709815979 + }, + { + "auxiliary_loss_clip": 0.01510887, + "auxiliary_loss_mlp": 0.01281525, + "balance_loss_clip": 1.19314933, + "balance_loss_mlp": 1.03890991, + "epoch": 0.3580339696377574, + "flos": 13591426165920.0, + "grad_norm": 2.807722554185958, + "language_loss": 0.75244659, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.78037071, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.7809817790985107 + }, + { + "auxiliary_loss_clip": 0.01509401, + "auxiliary_loss_mlp": 0.01277757, + "balance_loss_clip": 1.19245517, + "balance_loss_mlp": 1.03781283, + "epoch": 0.3580940928904254, + "flos": 23770459823040.0, + "grad_norm": 1.5371798889914172, + "language_loss": 0.76056218, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78843373, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.780941963195801 + }, + { + "auxiliary_loss_clip": 0.01506099, + "auxiliary_loss_mlp": 0.0126909, + "balance_loss_clip": 1.1873821, + "balance_loss_mlp": 1.03219676, + "epoch": 0.35815421614309334, + "flos": 19465912026720.0, + "grad_norm": 1.8149766816392128, + "language_loss": 0.73310173, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.76085353, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 2.791247844696045 + }, + { + "auxiliary_loss_clip": 0.01511583, + "auxiliary_loss_mlp": 0.01281134, + "balance_loss_clip": 1.19506001, + "balance_loss_mlp": 1.04099846, + "epoch": 0.3582143393957613, + "flos": 23370465440160.0, + "grad_norm": 1.8891877557312153, + "language_loss": 0.71454161, + "learning_rate": 2.972443318242726e-06, + "loss": 0.74246877, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.8024826049804688 + }, + { + "auxiliary_loss_clip": 0.0150369, + "auxiliary_loss_mlp": 0.01270271, + "balance_loss_clip": 1.18643034, + "balance_loss_mlp": 1.03185236, + "epoch": 0.35827446264842927, + "flos": 26325705464640.0, + "grad_norm": 19.835329298801508, + "language_loss": 0.88544405, + "learning_rate": 2.972102974360324e-06, + "loss": 0.91318375, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.7984657287597656 + }, + { + "auxiliary_loss_clip": 0.01505874, + "auxiliary_loss_mlp": 0.01279875, + "balance_loss_clip": 1.18763125, + "balance_loss_mlp": 1.04031181, + "epoch": 0.35833458590109724, + "flos": 30449296188000.0, + "grad_norm": 1.4957373333031019, + "language_loss": 0.57964188, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60749936, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 2.8107240200042725 + }, + { + "auxiliary_loss_clip": 0.01496577, + "auxiliary_loss_mlp": 0.01280437, + "balance_loss_clip": 1.17788601, + "balance_loss_mlp": 1.04182744, + "epoch": 0.3583947091537652, + "flos": 14831651456640.0, + "grad_norm": 4.043008201077887, + "language_loss": 0.76351535, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.79128551, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.754462718963623 + }, + { + "auxiliary_loss_clip": 0.0149777, + "auxiliary_loss_mlp": 0.01281084, + "balance_loss_clip": 1.18003976, + "balance_loss_mlp": 1.04247439, + "epoch": 0.35845483240643317, + "flos": 34243515491040.0, + "grad_norm": 1.618485478249755, + "language_loss": 0.70148945, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72927797, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.903163433074951 + }, + { + "auxiliary_loss_clip": 0.01504144, + "auxiliary_loss_mlp": 0.01276079, + "balance_loss_clip": 1.18713081, + "balance_loss_mlp": 1.03613424, + "epoch": 0.35851495565910113, + "flos": 20962230804000.0, + "grad_norm": 1.7893768704991146, + "language_loss": 0.74694681, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.77474904, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.826420783996582 + }, + { + "auxiliary_loss_clip": 0.01505026, + "auxiliary_loss_mlp": 0.01276547, + "balance_loss_clip": 1.18711996, + "balance_loss_mlp": 1.03336, + "epoch": 0.35857507891176915, + "flos": 22312069570080.0, + "grad_norm": 1.8168810700426228, + "language_loss": 0.78558409, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.81339979, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 4.277425289154053 + }, + { + "auxiliary_loss_clip": 0.01505132, + "auxiliary_loss_mlp": 0.01281137, + "balance_loss_clip": 1.18528974, + "balance_loss_mlp": 1.03966713, + "epoch": 0.3586352021644371, + "flos": 23370124086720.0, + "grad_norm": 2.1691634406684495, + "language_loss": 0.66423798, + "learning_rate": 2.970060137410626e-06, + "loss": 0.69210064, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 4.332896709442139 + }, + { + "auxiliary_loss_clip": 0.01511964, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 1.19296706, + "balance_loss_mlp": 1.0405221, + "epoch": 0.3586953254171051, + "flos": 27851381002080.0, + "grad_norm": 2.779674410353966, + "language_loss": 0.79351389, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.82144964, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 2.8163392543792725 + }, + { + "auxiliary_loss_clip": 0.01511628, + "auxiliary_loss_mlp": 0.01292392, + "balance_loss_clip": 1.19158685, + "balance_loss_mlp": 1.05416417, + "epoch": 0.35875544866977305, + "flos": 19502550993600.0, + "grad_norm": 2.127254087658592, + "language_loss": 0.91700375, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.94504392, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 4.253779888153076 + }, + { + "auxiliary_loss_clip": 0.01512419, + "auxiliary_loss_mlp": 0.01301952, + "balance_loss_clip": 1.19223261, + "balance_loss_mlp": 1.06315231, + "epoch": 0.358815571922441, + "flos": 21473317860480.0, + "grad_norm": 2.242046438183773, + "language_loss": 0.80784094, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.83598465, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 2.78433895111084 + }, + { + "auxiliary_loss_clip": 0.01504573, + "auxiliary_loss_mlp": 0.01288658, + "balance_loss_clip": 1.18381381, + "balance_loss_mlp": 1.04756927, + "epoch": 0.358875695175109, + "flos": 21837583552320.0, + "grad_norm": 2.26346506074453, + "language_loss": 0.84545612, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.87338847, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 2.739550828933716 + }, + { + "auxiliary_loss_clip": 0.01504181, + "auxiliary_loss_mlp": 0.01283467, + "balance_loss_clip": 1.18527246, + "balance_loss_mlp": 1.0435226, + "epoch": 0.35893581842777694, + "flos": 32014189807200.0, + "grad_norm": 1.845039453373192, + "language_loss": 0.72329247, + "learning_rate": 2.968356761586202e-06, + "loss": 0.75116897, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.848010540008545 + }, + { + "auxiliary_loss_clip": 0.01510416, + "auxiliary_loss_mlp": 0.01291584, + "balance_loss_clip": 1.19178915, + "balance_loss_mlp": 1.05450106, + "epoch": 0.3589959416804449, + "flos": 20487934427040.0, + "grad_norm": 1.6027838329224555, + "language_loss": 0.79762721, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.82564723, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 2.7252674102783203 + }, + { + "auxiliary_loss_clip": 0.01500543, + "auxiliary_loss_mlp": 0.01269028, + "balance_loss_clip": 1.18027496, + "balance_loss_mlp": 1.02679491, + "epoch": 0.3590560649331129, + "flos": 16182552211200.0, + "grad_norm": 2.1303776780804977, + "language_loss": 0.78616548, + "learning_rate": 2.967675154124696e-06, + "loss": 0.81386119, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 2.7949788570404053 + }, + { + "auxiliary_loss_clip": 0.014978, + "auxiliary_loss_mlp": 0.01287112, + "balance_loss_clip": 1.17882371, + "balance_loss_mlp": 1.0492661, + "epoch": 0.35911618818578084, + "flos": 20377448604000.0, + "grad_norm": 1.9198327398070807, + "language_loss": 0.8147037, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.84255284, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.7485580444335938 + }, + { + "auxiliary_loss_clip": 0.01635838, + "auxiliary_loss_mlp": 0.01243385, + "balance_loss_clip": 1.32601428, + "balance_loss_mlp": 1.02442169, + "epoch": 0.3591763114384488, + "flos": 41241482308800.0, + "grad_norm": 0.9513501686394724, + "language_loss": 0.56660479, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.595397, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 3.1879775524139404 + }, + { + "auxiliary_loss_clip": 0.01509603, + "auxiliary_loss_mlp": 0.01285074, + "balance_loss_clip": 1.19007921, + "balance_loss_mlp": 1.04474795, + "epoch": 0.35923643469111677, + "flos": 18697328141760.0, + "grad_norm": 1.9658890949088066, + "language_loss": 0.69131684, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.71926367, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.7609570026397705 + }, + { + "auxiliary_loss_clip": 0.01502009, + "auxiliary_loss_mlp": 0.0128327, + "balance_loss_clip": 1.18337476, + "balance_loss_mlp": 1.04332566, + "epoch": 0.35929655794378473, + "flos": 25012316024640.0, + "grad_norm": 1.5773677416399559, + "language_loss": 0.79961526, + "learning_rate": 2.96631149897303e-06, + "loss": 0.8274681, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 2.7985565662384033 + }, + { + "auxiliary_loss_clip": 0.01515541, + "auxiliary_loss_mlp": 0.01279829, + "balance_loss_clip": 1.19646335, + "balance_loss_mlp": 1.04102862, + "epoch": 0.35935668119645275, + "flos": 14977334976480.0, + "grad_norm": 1.794133123253224, + "language_loss": 0.78727633, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81523007, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.7279984951019287 + }, + { + "auxiliary_loss_clip": 0.01513353, + "auxiliary_loss_mlp": 0.01282576, + "balance_loss_clip": 1.19593716, + "balance_loss_mlp": 1.04568362, + "epoch": 0.3594168044491207, + "flos": 21180244053600.0, + "grad_norm": 1.8324813085310936, + "language_loss": 0.80039442, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.8283537, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.800930976867676 + }, + { + "auxiliary_loss_clip": 0.01523622, + "auxiliary_loss_mlp": 0.01290782, + "balance_loss_clip": 1.2060014, + "balance_loss_mlp": 1.05388951, + "epoch": 0.3594769277017887, + "flos": 27674596026720.0, + "grad_norm": 2.065908352342876, + "language_loss": 0.67732108, + "learning_rate": 2.965288372816436e-06, + "loss": 0.70546514, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.836941957473755 + }, + { + "auxiliary_loss_clip": 0.01515041, + "auxiliary_loss_mlp": 0.01286272, + "balance_loss_clip": 1.19892049, + "balance_loss_mlp": 1.04728127, + "epoch": 0.35953705095445665, + "flos": 23004530909280.0, + "grad_norm": 5.284599013787234, + "language_loss": 0.66967869, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.6976918, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.7288477420806885 + }, + { + "auxiliary_loss_clip": 0.01519449, + "auxiliary_loss_mlp": 0.01285426, + "balance_loss_clip": 1.20578313, + "balance_loss_mlp": 1.04548109, + "epoch": 0.3595971742071246, + "flos": 25516007089920.0, + "grad_norm": 3.7900926450832504, + "language_loss": 0.71045345, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73850214, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 2.821836233139038 + }, + { + "auxiliary_loss_clip": 0.01521565, + "auxiliary_loss_mlp": 0.01292646, + "balance_loss_clip": 1.20678127, + "balance_loss_mlp": 1.05270159, + "epoch": 0.3596572974597926, + "flos": 29865196694880.0, + "grad_norm": 4.383729919905995, + "language_loss": 0.71442032, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.74256241, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 2.8044233322143555 + }, + { + "auxiliary_loss_clip": 0.01531276, + "auxiliary_loss_mlp": 0.0129696, + "balance_loss_clip": 1.21724558, + "balance_loss_mlp": 1.06063926, + "epoch": 0.35971742071246054, + "flos": 23114789163360.0, + "grad_norm": 1.8730743996908237, + "language_loss": 0.75846982, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78675222, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 2.7731857299804688 + }, + { + "auxiliary_loss_clip": 0.01538643, + "auxiliary_loss_mlp": 0.01294214, + "balance_loss_clip": 1.22510362, + "balance_loss_mlp": 1.05217171, + "epoch": 0.3597775439651285, + "flos": 16727130197280.0, + "grad_norm": 2.0506986457921124, + "language_loss": 0.76490301, + "learning_rate": 2.96358243065131e-06, + "loss": 0.79323155, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 2.8442904949188232 + }, + { + "auxiliary_loss_clip": 0.01535401, + "auxiliary_loss_mlp": 0.01274626, + "balance_loss_clip": 1.22114849, + "balance_loss_mlp": 1.03658915, + "epoch": 0.3598376672177965, + "flos": 19721550375360.0, + "grad_norm": 2.1132346712651016, + "language_loss": 0.8671397, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.89524001, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 2.7687642574310303 + }, + { + "auxiliary_loss_clip": 0.0154331, + "auxiliary_loss_mlp": 0.0129054, + "balance_loss_clip": 1.2291038, + "balance_loss_mlp": 1.05498207, + "epoch": 0.35989779047046444, + "flos": 17313467451840.0, + "grad_norm": 1.453460376637793, + "language_loss": 0.72565347, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.75399196, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 2.7385430335998535 + }, + { + "auxiliary_loss_clip": 0.01534557, + "auxiliary_loss_mlp": 0.01297849, + "balance_loss_clip": 1.22106361, + "balance_loss_mlp": 1.05752301, + "epoch": 0.3599579137231324, + "flos": 22713467294880.0, + "grad_norm": 1.705683408097654, + "language_loss": 0.74017507, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.76849914, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 2.77724552154541 + }, + { + "auxiliary_loss_clip": 0.01543217, + "auxiliary_loss_mlp": 0.01296579, + "balance_loss_clip": 1.22952366, + "balance_loss_mlp": 1.0545361, + "epoch": 0.36001803697580037, + "flos": 20962003235040.0, + "grad_norm": 1.8500815571152305, + "language_loss": 0.69245231, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.72085023, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 2.7649526596069336 + }, + { + "auxiliary_loss_clip": 0.01543267, + "auxiliary_loss_mlp": 0.01291293, + "balance_loss_clip": 1.22961378, + "balance_loss_mlp": 1.04963183, + "epoch": 0.36007816022846834, + "flos": 20487668929920.0, + "grad_norm": 1.8551874636339813, + "language_loss": 0.73541462, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.76376021, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.7373769283294678 + }, + { + "auxiliary_loss_clip": 0.01538213, + "auxiliary_loss_mlp": 0.01270582, + "balance_loss_clip": 1.22466278, + "balance_loss_mlp": 1.0306375, + "epoch": 0.36013828348113636, + "flos": 28003739878080.0, + "grad_norm": 1.8492198575603394, + "language_loss": 0.80275023, + "learning_rate": 2.961534094403931e-06, + "loss": 0.83083808, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.7979331016540527 + }, + { + "auxiliary_loss_clip": 0.0153753, + "auxiliary_loss_mlp": 0.01281042, + "balance_loss_clip": 1.2230134, + "balance_loss_mlp": 1.0397625, + "epoch": 0.3601984067338043, + "flos": 20084147228160.0, + "grad_norm": 1.658721976268634, + "language_loss": 0.83993453, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86812019, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.7469730377197266 + }, + { + "auxiliary_loss_clip": 0.01535178, + "auxiliary_loss_mlp": 0.01277839, + "balance_loss_clip": 1.22025907, + "balance_loss_mlp": 1.03255391, + "epoch": 0.3602585299864723, + "flos": 18619195403520.0, + "grad_norm": 1.8524615569940739, + "language_loss": 0.75560677, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.78373694, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 4.407474756240845 + }, + { + "auxiliary_loss_clip": 0.01544102, + "auxiliary_loss_mlp": 0.01278564, + "balance_loss_clip": 1.23087883, + "balance_loss_mlp": 1.03861928, + "epoch": 0.36031865323914025, + "flos": 19575032436000.0, + "grad_norm": 2.0870779736256453, + "language_loss": 0.7754001, + "learning_rate": 2.960509433875627e-06, + "loss": 0.80362678, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 2.7474284172058105 + }, + { + "auxiliary_loss_clip": 0.01541161, + "auxiliary_loss_mlp": 0.01283243, + "balance_loss_clip": 1.22681713, + "balance_loss_mlp": 1.03986549, + "epoch": 0.3603787764918082, + "flos": 17492376404160.0, + "grad_norm": 2.0509985216604236, + "language_loss": 0.74560452, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.77384853, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.77828311920166 + }, + { + "auxiliary_loss_clip": 0.01540896, + "auxiliary_loss_mlp": 0.01279566, + "balance_loss_clip": 1.22556865, + "balance_loss_mlp": 1.0356164, + "epoch": 0.3604388997444762, + "flos": 15525212712480.0, + "grad_norm": 2.3275374195033316, + "language_loss": 0.69314349, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.72134805, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 2.7524523735046387 + }, + { + "auxiliary_loss_clip": 0.01534296, + "auxiliary_loss_mlp": 0.01277051, + "balance_loss_clip": 1.21929383, + "balance_loss_mlp": 1.03443646, + "epoch": 0.36049902299714415, + "flos": 17312784744960.0, + "grad_norm": 2.2203159567882875, + "language_loss": 0.8296715, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.85778499, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 2.7650365829467773 + }, + { + "auxiliary_loss_clip": 0.01534432, + "auxiliary_loss_mlp": 0.01266775, + "balance_loss_clip": 1.21965384, + "balance_loss_mlp": 1.02187121, + "epoch": 0.3605591462498121, + "flos": 17057942887680.0, + "grad_norm": 1.7230753459333514, + "language_loss": 0.7383126, + "learning_rate": 2.959142709981763e-06, + "loss": 0.76632464, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 2.6958110332489014 + }, + { + "auxiliary_loss_clip": 0.01541754, + "auxiliary_loss_mlp": 0.01277579, + "balance_loss_clip": 1.22606325, + "balance_loss_mlp": 1.03668141, + "epoch": 0.3606192695024801, + "flos": 16838753865120.0, + "grad_norm": 2.5789037834906776, + "language_loss": 0.68989491, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71808827, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.761300563812256 + }, + { + "auxiliary_loss_clip": 0.01546126, + "auxiliary_loss_mlp": 0.01273729, + "balance_loss_clip": 1.23183286, + "balance_loss_mlp": 1.02672696, + "epoch": 0.36067939275514804, + "flos": 12131253289440.0, + "grad_norm": 4.868588367660037, + "language_loss": 0.77592504, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.80412358, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 2.709371566772461 + }, + { + "auxiliary_loss_clip": 0.01537549, + "auxiliary_loss_mlp": 0.01265435, + "balance_loss_clip": 1.22253442, + "balance_loss_mlp": 1.01938677, + "epoch": 0.360739516007816, + "flos": 18043440105600.0, + "grad_norm": 2.058644267118808, + "language_loss": 0.78640926, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.81443912, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.776345729827881 + }, + { + "auxiliary_loss_clip": 0.01533713, + "auxiliary_loss_mlp": 0.0127839, + "balance_loss_clip": 1.21769059, + "balance_loss_mlp": 1.03596616, + "epoch": 0.360799639260484, + "flos": 18551682550080.0, + "grad_norm": 1.7415384623674324, + "language_loss": 0.78653932, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.81466043, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 4.3721160888671875 + }, + { + "auxiliary_loss_clip": 0.01534179, + "auxiliary_loss_mlp": 0.01272719, + "balance_loss_clip": 1.21841359, + "balance_loss_mlp": 1.02972293, + "epoch": 0.36085976251315194, + "flos": 19684039060800.0, + "grad_norm": 2.0462647316686553, + "language_loss": 0.83575284, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.86382186, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 4.476437568664551 + }, + { + "auxiliary_loss_clip": 0.01545945, + "auxiliary_loss_mlp": 0.0127445, + "balance_loss_clip": 1.23072851, + "balance_loss_mlp": 1.03393328, + "epoch": 0.3609198857658199, + "flos": 24200797098240.0, + "grad_norm": 2.429418635394944, + "language_loss": 0.90767336, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.93587726, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.769990921020508 + }, + { + "auxiliary_loss_clip": 0.01697122, + "auxiliary_loss_mlp": 0.0126255, + "balance_loss_clip": 1.38936472, + "balance_loss_mlp": 1.05121613, + "epoch": 0.3609800090184879, + "flos": 57122123451840.0, + "grad_norm": 0.882333627750725, + "language_loss": 0.53363085, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.56322753, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.221820116043091 + }, + { + "auxiliary_loss_clip": 0.01529143, + "auxiliary_loss_mlp": 0.01273612, + "balance_loss_clip": 1.21280348, + "balance_loss_mlp": 1.02737391, + "epoch": 0.3610401322711559, + "flos": 20813209606080.0, + "grad_norm": 1.7733347491327245, + "language_loss": 0.77554619, + "learning_rate": 2.956407517225883e-06, + "loss": 0.80357373, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 4.326914548873901 + }, + { + "auxiliary_loss_clip": 0.01533154, + "auxiliary_loss_mlp": 0.01275996, + "balance_loss_clip": 1.21660101, + "balance_loss_mlp": 1.03319025, + "epoch": 0.36110025552382385, + "flos": 13700887928640.0, + "grad_norm": 3.206996686078784, + "language_loss": 0.79359519, + "learning_rate": 2.956065454793429e-06, + "loss": 0.82168669, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.779749870300293 + }, + { + "auxiliary_loss_clip": 0.01539585, + "auxiliary_loss_mlp": 0.01286075, + "balance_loss_clip": 1.22294831, + "balance_loss_mlp": 1.03983617, + "epoch": 0.3611603787764918, + "flos": 22457184167520.0, + "grad_norm": 2.152796560049705, + "language_loss": 0.84832323, + "learning_rate": 2.955723356106876e-06, + "loss": 0.87657982, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 2.8302602767944336 + }, + { + "auxiliary_loss_clip": 0.01529119, + "auxiliary_loss_mlp": 0.01276733, + "balance_loss_clip": 1.21192813, + "balance_loss_mlp": 1.02915919, + "epoch": 0.3612205020291598, + "flos": 20888763229440.0, + "grad_norm": 2.26113471427051, + "language_loss": 0.72245634, + "learning_rate": 2.955381221179198e-06, + "loss": 0.75051486, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 2.810265064239502 + }, + { + "auxiliary_loss_clip": 0.01531475, + "auxiliary_loss_mlp": 0.01273752, + "balance_loss_clip": 1.21462655, + "balance_loss_mlp": 1.02655935, + "epoch": 0.36128062528182775, + "flos": 15743794884480.0, + "grad_norm": 4.616615782371972, + "language_loss": 0.83087909, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85893136, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.7361276149749756 + }, + { + "auxiliary_loss_clip": 0.01535559, + "auxiliary_loss_mlp": 0.01277464, + "balance_loss_clip": 1.21861732, + "balance_loss_mlp": 1.02931833, + "epoch": 0.3613407485344957, + "flos": 16766424135360.0, + "grad_norm": 2.217894906546855, + "language_loss": 0.76592946, + "learning_rate": 2.954696842652362e-06, + "loss": 0.79405969, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 2.7191002368927 + }, + { + "auxiliary_loss_clip": 0.01540008, + "auxiliary_loss_mlp": 0.01288219, + "balance_loss_clip": 1.22417998, + "balance_loss_mlp": 1.0429337, + "epoch": 0.3614008717871637, + "flos": 20373011009280.0, + "grad_norm": 1.5574192861260006, + "language_loss": 0.82957315, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85785544, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 2.8163228034973145 + }, + { + "auxiliary_loss_clip": 0.01531672, + "auxiliary_loss_mlp": 0.01279803, + "balance_loss_clip": 1.21550894, + "balance_loss_mlp": 1.03318262, + "epoch": 0.36146099503983165, + "flos": 22778476889760.0, + "grad_norm": 2.1090013657849056, + "language_loss": 0.62767619, + "learning_rate": 2.954012319316727e-06, + "loss": 0.65579093, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 2.7866809368133545 + }, + { + "auxiliary_loss_clip": 0.01532664, + "auxiliary_loss_mlp": 0.0126464, + "balance_loss_clip": 1.21660542, + "balance_loss_mlp": 1.02526784, + "epoch": 0.3615211182924996, + "flos": 22998538260000.0, + "grad_norm": 1.740001167679264, + "language_loss": 0.84084952, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86882257, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.8116562366485596 + }, + { + "auxiliary_loss_clip": 0.01529823, + "auxiliary_loss_mlp": 0.01268467, + "balance_loss_clip": 1.21305096, + "balance_loss_mlp": 1.02718735, + "epoch": 0.3615812415451676, + "flos": 16649604309600.0, + "grad_norm": 1.9000423776698308, + "language_loss": 0.91475844, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.94274127, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.737062692642212 + }, + { + "auxiliary_loss_clip": 0.01534275, + "auxiliary_loss_mlp": 0.01283156, + "balance_loss_clip": 1.21799016, + "balance_loss_mlp": 1.04187703, + "epoch": 0.36164136479783554, + "flos": 21321983044800.0, + "grad_norm": 2.0523993797123863, + "language_loss": 0.73895562, + "learning_rate": 2.95298526302391e-06, + "loss": 0.7671299, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.7597568035125732 + }, + { + "auxiliary_loss_clip": 0.0152572, + "auxiliary_loss_mlp": 0.01272028, + "balance_loss_clip": 1.20928168, + "balance_loss_mlp": 1.02750635, + "epoch": 0.3617014880505035, + "flos": 24171819619680.0, + "grad_norm": 1.8776315683720046, + "language_loss": 0.64741957, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67539704, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 2.7918527126312256 + }, + { + "auxiliary_loss_clip": 0.01536052, + "auxiliary_loss_mlp": 0.01280688, + "balance_loss_clip": 1.21985745, + "balance_loss_mlp": 1.03826404, + "epoch": 0.3617616113031715, + "flos": 39017656572480.0, + "grad_norm": 1.6600275048131252, + "language_loss": 0.71740794, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74557537, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.9213201999664307 + }, + { + "auxiliary_loss_clip": 0.01529736, + "auxiliary_loss_mlp": 0.01284842, + "balance_loss_clip": 1.21268225, + "balance_loss_mlp": 1.04012942, + "epoch": 0.3618217345558395, + "flos": 12132504918720.0, + "grad_norm": 2.0789369396060793, + "language_loss": 0.73227978, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.76042557, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.8233909606933594 + }, + { + "auxiliary_loss_clip": 0.01533866, + "auxiliary_loss_mlp": 0.01285588, + "balance_loss_clip": 1.21633053, + "balance_loss_mlp": 1.04564404, + "epoch": 0.36188185780850746, + "flos": 24937255467360.0, + "grad_norm": 1.6564219821737465, + "language_loss": 0.69044447, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71863902, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 2.7767281532287598 + }, + { + "auxiliary_loss_clip": 0.01523924, + "auxiliary_loss_mlp": 0.01269749, + "balance_loss_clip": 1.20519686, + "balance_loss_mlp": 1.02484512, + "epoch": 0.3619419810611754, + "flos": 20960903318400.0, + "grad_norm": 1.6175430489410896, + "language_loss": 0.76566517, + "learning_rate": 2.95127277996311e-06, + "loss": 0.79360187, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.7977781295776367 + }, + { + "auxiliary_loss_clip": 0.01532961, + "auxiliary_loss_mlp": 0.01275539, + "balance_loss_clip": 1.21477401, + "balance_loss_mlp": 1.03387761, + "epoch": 0.3620021043138434, + "flos": 22531106880000.0, + "grad_norm": 1.747335738724388, + "language_loss": 0.739573, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.767658, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 2.8376364707946777 + }, + { + "auxiliary_loss_clip": 0.01526847, + "auxiliary_loss_mlp": 0.01274093, + "balance_loss_clip": 1.20873547, + "balance_loss_mlp": 1.03433919, + "epoch": 0.36206222756651135, + "flos": 15598528574400.0, + "grad_norm": 2.0131767350028813, + "language_loss": 0.81127536, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83928478, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 2.7267541885375977 + }, + { + "auxiliary_loss_clip": 0.0153684, + "auxiliary_loss_mlp": 0.01292603, + "balance_loss_clip": 1.21706057, + "balance_loss_mlp": 1.05666435, + "epoch": 0.3621223508191793, + "flos": 23589464821920.0, + "grad_norm": 1.7227728910936295, + "language_loss": 0.81606323, + "learning_rate": 2.950244857154417e-06, + "loss": 0.84435767, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.8869168758392334 + }, + { + "auxiliary_loss_clip": 0.01519993, + "auxiliary_loss_mlp": 0.01281991, + "balance_loss_clip": 1.20178604, + "balance_loss_mlp": 1.04052043, + "epoch": 0.3621824740718473, + "flos": 22312107498240.0, + "grad_norm": 3.205629694528299, + "language_loss": 0.79301631, + "learning_rate": 2.9499021441341e-06, + "loss": 0.82103616, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 2.7306787967681885 + }, + { + "auxiliary_loss_clip": 0.01524698, + "auxiliary_loss_mlp": 0.01285239, + "balance_loss_clip": 1.20601285, + "balance_loss_mlp": 1.04910898, + "epoch": 0.36224259732451525, + "flos": 16765513859520.0, + "grad_norm": 2.3703217487537653, + "language_loss": 0.75513828, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.7832377, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 2.761723756790161 + }, + { + "auxiliary_loss_clip": 0.01525256, + "auxiliary_loss_mlp": 0.01270227, + "balance_loss_clip": 1.20617211, + "balance_loss_mlp": 1.03314328, + "epoch": 0.3623027205771832, + "flos": 23152034980800.0, + "grad_norm": 1.627180898461371, + "language_loss": 0.71967119, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74762607, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.8546056747436523 + }, + { + "auxiliary_loss_clip": 0.01528918, + "auxiliary_loss_mlp": 0.01298123, + "balance_loss_clip": 1.20933723, + "balance_loss_mlp": 1.05341005, + "epoch": 0.3623628438298512, + "flos": 28551731398560.0, + "grad_norm": 2.7938886372090344, + "language_loss": 0.79128069, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81955111, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 2.8081226348876953 + }, + { + "auxiliary_loss_clip": 0.01528977, + "auxiliary_loss_mlp": 0.01296719, + "balance_loss_clip": 1.20966339, + "balance_loss_mlp": 1.05753708, + "epoch": 0.36242296708251914, + "flos": 25487370964800.0, + "grad_norm": 3.659207467658624, + "language_loss": 0.6752919, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.70354885, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 2.8141348361968994 + }, + { + "auxiliary_loss_clip": 0.01524856, + "auxiliary_loss_mlp": 0.01273771, + "balance_loss_clip": 1.20640099, + "balance_loss_mlp": 1.03611529, + "epoch": 0.3624830903351871, + "flos": 16292279471040.0, + "grad_norm": 1.9510667611004098, + "language_loss": 0.85459435, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.88258064, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.769993305206299 + }, + { + "auxiliary_loss_clip": 0.0152227, + "auxiliary_loss_mlp": 0.01272329, + "balance_loss_clip": 1.20395517, + "balance_loss_mlp": 1.03448224, + "epoch": 0.36254321358785513, + "flos": 18298281962880.0, + "grad_norm": 1.7288670395072492, + "language_loss": 0.72769797, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.75564402, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 4.380281448364258 + }, + { + "auxiliary_loss_clip": 0.01530271, + "auxiliary_loss_mlp": 0.01289206, + "balance_loss_clip": 1.21144164, + "balance_loss_mlp": 1.04754496, + "epoch": 0.3626033368405231, + "flos": 14867000866080.0, + "grad_norm": 2.3196680304213535, + "language_loss": 0.74790913, + "learning_rate": 2.94750214514905e-06, + "loss": 0.77610391, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.7513153553009033 + }, + { + "auxiliary_loss_clip": 0.01521511, + "auxiliary_loss_mlp": 0.01272016, + "balance_loss_clip": 1.20264196, + "balance_loss_mlp": 1.030164, + "epoch": 0.36266346009319106, + "flos": 22308618107520.0, + "grad_norm": 1.6472109549182705, + "language_loss": 0.73213756, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.76007283, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 2.778765916824341 + }, + { + "auxiliary_loss_clip": 0.01518806, + "auxiliary_loss_mlp": 0.01276493, + "balance_loss_clip": 1.20005083, + "balance_loss_mlp": 1.03349686, + "epoch": 0.362723583345859, + "flos": 18224245465920.0, + "grad_norm": 2.0317591721920394, + "language_loss": 0.77859282, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80654585, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 2.773650884628296 + }, + { + "auxiliary_loss_clip": 0.01693986, + "auxiliary_loss_mlp": 0.01226318, + "balance_loss_clip": 1.38245964, + "balance_loss_mlp": 1.01116943, + "epoch": 0.362783706598527, + "flos": 68505919205760.0, + "grad_norm": 0.7801912653230735, + "language_loss": 0.64733565, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.67653871, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.374204397201538 + }, + { + "auxiliary_loss_clip": 0.01520181, + "auxiliary_loss_mlp": 0.01286301, + "balance_loss_clip": 1.20072365, + "balance_loss_mlp": 1.04788232, + "epoch": 0.36284382985119495, + "flos": 26578916411040.0, + "grad_norm": 1.5314942433402672, + "language_loss": 0.89773875, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92580354, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 2.848893165588379 + }, + { + "auxiliary_loss_clip": 0.01515487, + "auxiliary_loss_mlp": 0.01291671, + "balance_loss_clip": 1.19575143, + "balance_loss_mlp": 1.05077291, + "epoch": 0.3629039531038629, + "flos": 20158828503840.0, + "grad_norm": 2.0377499133553245, + "language_loss": 0.73667419, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.76474577, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 2.823270559310913 + }, + { + "auxiliary_loss_clip": 0.01513337, + "auxiliary_loss_mlp": 0.01267839, + "balance_loss_clip": 1.1932199, + "balance_loss_mlp": 1.0248425, + "epoch": 0.3629640763565309, + "flos": 18627767167680.0, + "grad_norm": 3.868361349873422, + "language_loss": 0.76149011, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78930187, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 2.771172046661377 + }, + { + "auxiliary_loss_clip": 0.01519882, + "auxiliary_loss_mlp": 0.01282488, + "balance_loss_clip": 1.20012534, + "balance_loss_mlp": 1.04235268, + "epoch": 0.36302419960919885, + "flos": 19573401525120.0, + "grad_norm": 1.7869738206491328, + "language_loss": 0.78590095, + "learning_rate": 2.945100385624828e-06, + "loss": 0.81392461, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.7732393741607666 + }, + { + "auxiliary_loss_clip": 0.01679505, + "auxiliary_loss_mlp": 0.01226448, + "balance_loss_clip": 1.36792874, + "balance_loss_mlp": 1.00901031, + "epoch": 0.3630843228618668, + "flos": 63804373351200.0, + "grad_norm": 0.8452579963371253, + "language_loss": 0.63409972, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.66315925, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 4.89352011680603 + }, + { + "auxiliary_loss_clip": 0.01509925, + "auxiliary_loss_mlp": 0.01279513, + "balance_loss_clip": 1.19020176, + "balance_loss_mlp": 1.04147577, + "epoch": 0.3631444461145348, + "flos": 21837318055200.0, + "grad_norm": 2.6141227777334963, + "language_loss": 0.71207213, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73996657, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 4.331967353820801 + }, + { + "auxiliary_loss_clip": 0.01501662, + "auxiliary_loss_mlp": 0.01287121, + "balance_loss_clip": 1.18210661, + "balance_loss_mlp": 1.04450607, + "epoch": 0.36320456936720275, + "flos": 21723987620160.0, + "grad_norm": 1.5486317800054308, + "language_loss": 0.81166661, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83955443, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 2.8064522743225098 + }, + { + "auxiliary_loss_clip": 0.01502149, + "auxiliary_loss_mlp": 0.01280735, + "balance_loss_clip": 1.18326747, + "balance_loss_mlp": 1.03583193, + "epoch": 0.3632646926198707, + "flos": 17020962567360.0, + "grad_norm": 2.0641055460182605, + "language_loss": 0.84145254, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86928135, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 2.781064033508301 + }, + { + "auxiliary_loss_clip": 0.01514148, + "auxiliary_loss_mlp": 0.01285072, + "balance_loss_clip": 1.19447422, + "balance_loss_mlp": 1.04398274, + "epoch": 0.36332481587253873, + "flos": 23333409263520.0, + "grad_norm": 1.8014196036280676, + "language_loss": 0.78419316, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.81218535, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 2.8077826499938965 + }, + { + "auxiliary_loss_clip": 0.01509568, + "auxiliary_loss_mlp": 0.0127547, + "balance_loss_clip": 1.18926632, + "balance_loss_mlp": 1.03438151, + "epoch": 0.3633849391252067, + "flos": 10745078981760.0, + "grad_norm": 2.9628986535325756, + "language_loss": 0.65498316, + "learning_rate": 2.943040336741298e-06, + "loss": 0.68283355, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 4.300287961959839 + }, + { + "auxiliary_loss_clip": 0.01507173, + "auxiliary_loss_mlp": 0.01278554, + "balance_loss_clip": 1.18826616, + "balance_loss_mlp": 1.03956342, + "epoch": 0.36344506237787466, + "flos": 25851560800320.0, + "grad_norm": 1.6983004976370515, + "language_loss": 0.81295514, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.84081239, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.8015997409820557 + }, + { + "auxiliary_loss_clip": 0.01504496, + "auxiliary_loss_mlp": 0.01289499, + "balance_loss_clip": 1.18506646, + "balance_loss_mlp": 1.04764748, + "epoch": 0.3635051856305426, + "flos": 30156677519040.0, + "grad_norm": 1.8411308192435127, + "language_loss": 0.64301658, + "learning_rate": 2.942353367559755e-06, + "loss": 0.67095649, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.8724820613861084 + }, + { + "auxiliary_loss_clip": 0.0150906, + "auxiliary_loss_mlp": 0.0127112, + "balance_loss_clip": 1.18796945, + "balance_loss_mlp": 1.03098452, + "epoch": 0.3635653088832106, + "flos": 22200483830400.0, + "grad_norm": 1.5565864954699569, + "language_loss": 0.77545655, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.8032583, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 2.823306083679199 + }, + { + "auxiliary_loss_clip": 0.01502564, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_clip": 1.18204999, + "balance_loss_mlp": 1.03768086, + "epoch": 0.36362543213587856, + "flos": 24789182473440.0, + "grad_norm": 2.5329549532406377, + "language_loss": 0.79605937, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.82389939, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 2.8248789310455322 + }, + { + "auxiliary_loss_clip": 0.01675101, + "auxiliary_loss_mlp": 0.01250053, + "balance_loss_clip": 1.36422443, + "balance_loss_mlp": 1.03566742, + "epoch": 0.3636855553885465, + "flos": 62533387958400.0, + "grad_norm": 0.7490073857210726, + "language_loss": 0.52469349, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.55394506, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 3.466377019882202 + }, + { + "auxiliary_loss_clip": 0.01507159, + "auxiliary_loss_mlp": 0.01272315, + "balance_loss_clip": 1.18687999, + "balance_loss_mlp": 1.02989089, + "epoch": 0.3637456786412145, + "flos": 24062926779360.0, + "grad_norm": 2.041297096296657, + "language_loss": 0.86716688, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.8949616, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.7859482765197754 + }, + { + "auxiliary_loss_clip": 0.01506292, + "auxiliary_loss_mlp": 0.01269455, + "balance_loss_clip": 1.18582976, + "balance_loss_mlp": 1.03046465, + "epoch": 0.36380580189388245, + "flos": 16693411698720.0, + "grad_norm": 3.450378709208529, + "language_loss": 0.78696918, + "learning_rate": 2.940635319486546e-06, + "loss": 0.81472665, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 2.787771224975586 + }, + { + "auxiliary_loss_clip": 0.01503717, + "auxiliary_loss_mlp": 0.0127866, + "balance_loss_clip": 1.18295836, + "balance_loss_mlp": 1.03776133, + "epoch": 0.3638659251465504, + "flos": 25116164419680.0, + "grad_norm": 2.3759351666301276, + "language_loss": 0.82519799, + "learning_rate": 2.940291602812822e-06, + "loss": 0.85302174, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.827021360397339 + }, + { + "auxiliary_loss_clip": 0.01495856, + "auxiliary_loss_mlp": 0.01274347, + "balance_loss_clip": 1.17481434, + "balance_loss_mlp": 1.03230476, + "epoch": 0.3639260483992184, + "flos": 23005251544320.0, + "grad_norm": 1.8497112789478722, + "language_loss": 0.72765672, + "learning_rate": 2.939947850483145e-06, + "loss": 0.7553587, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.7533795833587646 + }, + { + "auxiliary_loss_clip": 0.01666659, + "auxiliary_loss_mlp": 0.01239113, + "balance_loss_clip": 1.35493088, + "balance_loss_mlp": 1.02701569, + "epoch": 0.36398617165188635, + "flos": 70722387243360.0, + "grad_norm": 0.797272560875391, + "language_loss": 0.61207318, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.64113081, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 3.388273239135742 + }, + { + "auxiliary_loss_clip": 0.01510705, + "auxiliary_loss_mlp": 0.01276383, + "balance_loss_clip": 1.1892494, + "balance_loss_mlp": 1.03319585, + "epoch": 0.3640462949045543, + "flos": 22237577935200.0, + "grad_norm": 2.4301309698186735, + "language_loss": 0.76192629, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78979719, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.7763826847076416 + }, + { + "auxiliary_loss_clip": 0.01502757, + "auxiliary_loss_mlp": 0.01299183, + "balance_loss_clip": 1.18123662, + "balance_loss_mlp": 1.0628624, + "epoch": 0.3641064181572223, + "flos": 21545685518400.0, + "grad_norm": 1.737247500879144, + "language_loss": 0.75373441, + "learning_rate": 2.938916379688765e-06, + "loss": 0.78175384, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 2.809786796569824 + }, + { + "auxiliary_loss_clip": 0.01506115, + "auxiliary_loss_mlp": 0.01294964, + "balance_loss_clip": 1.1854732, + "balance_loss_mlp": 1.05501974, + "epoch": 0.3641665414098903, + "flos": 22275506459520.0, + "grad_norm": 1.9347437224993953, + "language_loss": 0.80014443, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82815522, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.8137381076812744 + }, + { + "auxiliary_loss_clip": 0.01497704, + "auxiliary_loss_mlp": 0.01285348, + "balance_loss_clip": 1.17633057, + "balance_loss_mlp": 1.04368711, + "epoch": 0.36422666466255826, + "flos": 28332201022560.0, + "grad_norm": 2.8324704940984935, + "language_loss": 0.80391181, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.83174229, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.8286569118499756 + }, + { + "auxiliary_loss_clip": 0.01495626, + "auxiliary_loss_mlp": 0.01271763, + "balance_loss_clip": 1.17372584, + "balance_loss_mlp": 1.0283848, + "epoch": 0.36428678791522623, + "flos": 24172805751840.0, + "grad_norm": 1.9144358312268772, + "language_loss": 0.85489988, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.88257384, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 2.80853533744812 + }, + { + "auxiliary_loss_clip": 0.01504403, + "auxiliary_loss_mlp": 0.0127483, + "balance_loss_clip": 1.18413317, + "balance_loss_mlp": 1.03049803, + "epoch": 0.3643469111678942, + "flos": 22530651742080.0, + "grad_norm": 1.7167184964538027, + "language_loss": 0.88486958, + "learning_rate": 2.937540586903884e-06, + "loss": 0.91266191, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 2.771310567855835 + }, + { + "auxiliary_loss_clip": 0.01502067, + "auxiliary_loss_mlp": 0.01281551, + "balance_loss_clip": 1.18208933, + "balance_loss_mlp": 1.03912699, + "epoch": 0.36440703442056216, + "flos": 19428590352960.0, + "grad_norm": 1.8692865551156073, + "language_loss": 0.66537201, + "learning_rate": 2.937196549795971e-06, + "loss": 0.69320816, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.7948484420776367 + }, + { + "auxiliary_loss_clip": 0.01509231, + "auxiliary_loss_mlp": 0.01297511, + "balance_loss_clip": 1.18846858, + "balance_loss_mlp": 1.04936528, + "epoch": 0.3644671576732301, + "flos": 18042453973440.0, + "grad_norm": 2.2691746680414826, + "language_loss": 0.75458252, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.78264987, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 2.781447172164917 + }, + { + "auxiliary_loss_clip": 0.01503244, + "auxiliary_loss_mlp": 0.01283091, + "balance_loss_clip": 1.18399405, + "balance_loss_mlp": 1.03818703, + "epoch": 0.3645272809258981, + "flos": 21544926955200.0, + "grad_norm": 1.683639601265669, + "language_loss": 0.72759825, + "learning_rate": 2.936508368977432e-06, + "loss": 0.75546157, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 2.804311752319336 + }, + { + "auxiliary_loss_clip": 0.01510424, + "auxiliary_loss_mlp": 0.01281534, + "balance_loss_clip": 1.19025147, + "balance_loss_mlp": 1.03777456, + "epoch": 0.36458740417856605, + "flos": 22749044273280.0, + "grad_norm": 2.430014735022536, + "language_loss": 0.67440552, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70232511, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.7516753673553467 + }, + { + "auxiliary_loss_clip": 0.0151138, + "auxiliary_loss_mlp": 0.01279646, + "balance_loss_clip": 1.19192791, + "balance_loss_mlp": 1.03436136, + "epoch": 0.364647527431234, + "flos": 26143079552640.0, + "grad_norm": 4.01931802776031, + "language_loss": 0.74475479, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.77266502, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 2.792773962020874 + }, + { + "auxiliary_loss_clip": 0.01508016, + "auxiliary_loss_mlp": 0.01280176, + "balance_loss_clip": 1.18696725, + "balance_loss_mlp": 1.03641701, + "epoch": 0.364707650683902, + "flos": 31032599189760.0, + "grad_norm": 18.95060292702927, + "language_loss": 0.7481038, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77598572, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.801175355911255 + }, + { + "auxiliary_loss_clip": 0.01504117, + "auxiliary_loss_mlp": 0.01277846, + "balance_loss_clip": 1.18527937, + "balance_loss_mlp": 1.03732955, + "epoch": 0.36476777393656995, + "flos": 19574918651520.0, + "grad_norm": 4.479220449805669, + "language_loss": 0.77239889, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.80021846, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 2.755589723587036 + }, + { + "auxiliary_loss_clip": 0.01520429, + "auxiliary_loss_mlp": 0.01286872, + "balance_loss_clip": 1.20344758, + "balance_loss_mlp": 1.0457828, + "epoch": 0.3648278971892379, + "flos": 17750783508480.0, + "grad_norm": 1.9504070102583004, + "language_loss": 0.70583314, + "learning_rate": 2.934787295690886e-06, + "loss": 0.73390615, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 4.402535915374756 + }, + { + "auxiliary_loss_clip": 0.01506858, + "auxiliary_loss_mlp": 0.01285167, + "balance_loss_clip": 1.18815827, + "balance_loss_mlp": 1.04198003, + "epoch": 0.3648880204419059, + "flos": 17933105995200.0, + "grad_norm": 8.175413038760581, + "language_loss": 0.74025172, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76817203, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.731325387954712 + }, + { + "auxiliary_loss_clip": 0.01511418, + "auxiliary_loss_mlp": 0.01294719, + "balance_loss_clip": 1.19283676, + "balance_loss_mlp": 1.05229473, + "epoch": 0.3649481436945739, + "flos": 22640492786400.0, + "grad_norm": 1.9260573127379068, + "language_loss": 0.66410625, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.69216764, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.807565927505493 + }, + { + "auxiliary_loss_clip": 0.01517919, + "auxiliary_loss_mlp": 0.01276586, + "balance_loss_clip": 1.19945669, + "balance_loss_mlp": 1.03645134, + "epoch": 0.36500826694724187, + "flos": 21581793491040.0, + "grad_norm": 1.9034495016004433, + "language_loss": 0.73903191, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76697707, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.7729337215423584 + }, + { + "auxiliary_loss_clip": 0.01509885, + "auxiliary_loss_mlp": 0.01266218, + "balance_loss_clip": 1.1900022, + "balance_loss_mlp": 1.02493811, + "epoch": 0.36506839019990983, + "flos": 13774279646880.0, + "grad_norm": 1.827975234538103, + "language_loss": 0.88325077, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.91101182, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 2.6904709339141846 + }, + { + "auxiliary_loss_clip": 0.0151313, + "auxiliary_loss_mlp": 0.01270597, + "balance_loss_clip": 1.19513679, + "balance_loss_mlp": 1.03027117, + "epoch": 0.3651285134525778, + "flos": 17276904341280.0, + "grad_norm": 2.1180027902443768, + "language_loss": 0.72745299, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.75529027, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.7875924110412598 + }, + { + "auxiliary_loss_clip": 0.01514727, + "auxiliary_loss_mlp": 0.01307095, + "balance_loss_clip": 1.19775712, + "balance_loss_mlp": 1.06619728, + "epoch": 0.36518863670524576, + "flos": 21910140851040.0, + "grad_norm": 3.0800813739881012, + "language_loss": 0.67097139, + "learning_rate": 2.932720838132236e-06, + "loss": 0.69918966, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.7669260501861572 + }, + { + "auxiliary_loss_clip": 0.01511248, + "auxiliary_loss_mlp": 0.01291341, + "balance_loss_clip": 1.19455373, + "balance_loss_mlp": 1.04777229, + "epoch": 0.3652487599579137, + "flos": 27124442601120.0, + "grad_norm": 1.5865406770988324, + "language_loss": 0.73155516, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75958109, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.822598695755005 + }, + { + "auxiliary_loss_clip": 0.01512707, + "auxiliary_loss_mlp": 0.01287604, + "balance_loss_clip": 1.19541645, + "balance_loss_mlp": 1.04231954, + "epoch": 0.3653088832105817, + "flos": 19757620419840.0, + "grad_norm": 1.905639237528965, + "language_loss": 0.8953746, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.92337769, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 2.754350185394287 + }, + { + "auxiliary_loss_clip": 0.01519389, + "auxiliary_loss_mlp": 0.01280428, + "balance_loss_clip": 1.20257688, + "balance_loss_mlp": 1.04086542, + "epoch": 0.36536900646324966, + "flos": 13116674651040.0, + "grad_norm": 2.15580704579013, + "language_loss": 0.69637752, + "learning_rate": 2.931687131696872e-06, + "loss": 0.72437567, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 4.390051364898682 + }, + { + "auxiliary_loss_clip": 0.01702775, + "auxiliary_loss_mlp": 0.01227348, + "balance_loss_clip": 1.3944242, + "balance_loss_mlp": 1.0160141, + "epoch": 0.3654291297159176, + "flos": 71107968561120.0, + "grad_norm": 0.7584161906301002, + "language_loss": 0.61705941, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.64636064, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.446611166000366 + }, + { + "auxiliary_loss_clip": 0.01500769, + "auxiliary_loss_mlp": 0.01280478, + "balance_loss_clip": 1.18236959, + "balance_loss_mlp": 1.04205942, + "epoch": 0.3654892529685856, + "flos": 23619428432640.0, + "grad_norm": 1.8768421611613793, + "language_loss": 0.7851032, + "learning_rate": 2.930997817403173e-06, + "loss": 0.81291568, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 4.272518873214722 + }, + { + "auxiliary_loss_clip": 0.01511749, + "auxiliary_loss_mlp": 0.01288042, + "balance_loss_clip": 1.1950376, + "balance_loss_mlp": 1.04847956, + "epoch": 0.36554937622125355, + "flos": 43474146035040.0, + "grad_norm": 2.226021093222093, + "language_loss": 0.62439013, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65238804, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.9502601623535156 + }, + { + "auxiliary_loss_clip": 0.01508265, + "auxiliary_loss_mlp": 0.01285158, + "balance_loss_clip": 1.19230616, + "balance_loss_mlp": 1.04483259, + "epoch": 0.3656094994739215, + "flos": 23296884081120.0, + "grad_norm": 2.8683344542762916, + "language_loss": 0.67187554, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69980979, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.8352696895599365 + }, + { + "auxiliary_loss_clip": 0.01512076, + "auxiliary_loss_mlp": 0.01301421, + "balance_loss_clip": 1.19443703, + "balance_loss_mlp": 1.06071341, + "epoch": 0.3656696227265895, + "flos": 24574317261120.0, + "grad_norm": 1.7714973617835468, + "language_loss": 0.75, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.778135, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 4.435353755950928 + }, + { + "auxiliary_loss_clip": 0.01509674, + "auxiliary_loss_mlp": 0.01293606, + "balance_loss_clip": 1.19156349, + "balance_loss_mlp": 1.05633235, + "epoch": 0.3657297459792575, + "flos": 27930879154080.0, + "grad_norm": 1.7737359234330525, + "language_loss": 0.82780612, + "learning_rate": 2.929618765277987e-06, + "loss": 0.85583889, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 2.808119773864746 + }, + { + "auxiliary_loss_clip": 0.01688839, + "auxiliary_loss_mlp": 0.01264519, + "balance_loss_clip": 1.38066435, + "balance_loss_mlp": 1.05699921, + "epoch": 0.36578986923192547, + "flos": 67398633573120.0, + "grad_norm": 0.81952416571451, + "language_loss": 0.59213507, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.62166858, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.4001657962799072 + }, + { + "auxiliary_loss_clip": 0.01511738, + "auxiliary_loss_mlp": 0.01299498, + "balance_loss_clip": 1.19396734, + "balance_loss_mlp": 1.06394041, + "epoch": 0.36584999248459343, + "flos": 20229451466400.0, + "grad_norm": 1.7973521663031673, + "language_loss": 0.73221642, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.76032877, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.807655096054077 + }, + { + "auxiliary_loss_clip": 0.01507952, + "auxiliary_loss_mlp": 0.01281611, + "balance_loss_clip": 1.19050562, + "balance_loss_mlp": 1.04395568, + "epoch": 0.3659101157372614, + "flos": 19064476373760.0, + "grad_norm": 2.051886450945256, + "language_loss": 0.78053796, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80843353, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 2.7137820720672607 + }, + { + "auxiliary_loss_clip": 0.01507025, + "auxiliary_loss_mlp": 0.01282106, + "balance_loss_clip": 1.18968594, + "balance_loss_mlp": 1.04368734, + "epoch": 0.36597023898992936, + "flos": 30813372239040.0, + "grad_norm": 1.98144640406328, + "language_loss": 0.77072692, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.7986182, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 2.9071924686431885 + }, + { + "auxiliary_loss_clip": 0.01507974, + "auxiliary_loss_mlp": 0.01299004, + "balance_loss_clip": 1.19042063, + "balance_loss_mlp": 1.0605855, + "epoch": 0.36603036224259733, + "flos": 20523890687040.0, + "grad_norm": 2.4836335604061657, + "language_loss": 0.7069419, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.7350117, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 2.7663047313690186 + }, + { + "auxiliary_loss_clip": 0.01506551, + "auxiliary_loss_mlp": 0.01284128, + "balance_loss_clip": 1.18848586, + "balance_loss_mlp": 1.03750801, + "epoch": 0.3660904854952653, + "flos": 38332629152640.0, + "grad_norm": 1.6157197719005525, + "language_loss": 0.79741192, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82531869, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 2.911646842956543 + }, + { + "auxiliary_loss_clip": 0.01505692, + "auxiliary_loss_mlp": 0.0128423, + "balance_loss_clip": 1.18879986, + "balance_loss_mlp": 1.04676473, + "epoch": 0.36615060874793326, + "flos": 21837735264960.0, + "grad_norm": 1.9764861821725728, + "language_loss": 0.71591485, + "learning_rate": 2.927204067389884e-06, + "loss": 0.74381411, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 2.78714919090271 + }, + { + "auxiliary_loss_clip": 0.01508328, + "auxiliary_loss_mlp": 0.01281939, + "balance_loss_clip": 1.1905998, + "balance_loss_mlp": 1.04447412, + "epoch": 0.3662107320006012, + "flos": 16583418941760.0, + "grad_norm": 2.479295398014328, + "language_loss": 0.74579155, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.77369428, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.862816333770752 + }, + { + "auxiliary_loss_clip": 0.01506073, + "auxiliary_loss_mlp": 0.0127244, + "balance_loss_clip": 1.18906176, + "balance_loss_mlp": 1.03001595, + "epoch": 0.3662708552532692, + "flos": 20960410252320.0, + "grad_norm": 2.246587360756452, + "language_loss": 0.73114562, + "learning_rate": 2.926513837074284e-06, + "loss": 0.7589308, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.780261278152466 + }, + { + "auxiliary_loss_clip": 0.01504453, + "auxiliary_loss_mlp": 0.01284206, + "balance_loss_clip": 1.18685865, + "balance_loss_mlp": 1.04502487, + "epoch": 0.36633097850593715, + "flos": 21904565411520.0, + "grad_norm": 2.9918485221114506, + "language_loss": 0.78664035, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.81452692, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 2.7914702892303467 + }, + { + "auxiliary_loss_clip": 0.01498115, + "auxiliary_loss_mlp": 0.01284936, + "balance_loss_clip": 1.18056679, + "balance_loss_mlp": 1.04537356, + "epoch": 0.3663911017586051, + "flos": 32856923973600.0, + "grad_norm": 1.8041568693236936, + "language_loss": 0.74393177, + "learning_rate": 2.925823466224696e-06, + "loss": 0.77176225, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 2.9153547286987305 + }, + { + "auxiliary_loss_clip": 0.01511016, + "auxiliary_loss_mlp": 0.01277327, + "balance_loss_clip": 1.19370341, + "balance_loss_mlp": 1.03413963, + "epoch": 0.3664512250112731, + "flos": 27274222362240.0, + "grad_norm": 1.5574356363195818, + "language_loss": 0.79426348, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.82214689, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.8229148387908936 + }, + { + "auxiliary_loss_clip": 0.01498974, + "auxiliary_loss_mlp": 0.01283699, + "balance_loss_clip": 1.18122816, + "balance_loss_mlp": 1.03803229, + "epoch": 0.3665113482639411, + "flos": 17786322558720.0, + "grad_norm": 2.4714767798127664, + "language_loss": 0.73685503, + "learning_rate": 2.925132954945834e-06, + "loss": 0.76468182, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 2.78041410446167 + }, + { + "auxiliary_loss_clip": 0.01506565, + "auxiliary_loss_mlp": 0.01288112, + "balance_loss_clip": 1.18923068, + "balance_loss_mlp": 1.0426358, + "epoch": 0.36657147151660907, + "flos": 27857070226080.0, + "grad_norm": 3.836587665618564, + "language_loss": 0.67561704, + "learning_rate": 2.924787646678155e-06, + "loss": 0.70356381, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 2.7729859352111816 + }, + { + "auxiliary_loss_clip": 0.0151319, + "auxiliary_loss_mlp": 0.01291833, + "balance_loss_clip": 1.19631934, + "balance_loss_mlp": 1.04864573, + "epoch": 0.36663159476927704, + "flos": 25376354147520.0, + "grad_norm": 1.563983001590336, + "language_loss": 0.7777397, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.80578995, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 2.825150728225708 + }, + { + "auxiliary_loss_clip": 0.01506849, + "auxiliary_loss_mlp": 0.01284006, + "balance_loss_clip": 1.18996024, + "balance_loss_mlp": 1.04196358, + "epoch": 0.366691718021945, + "flos": 21359153005920.0, + "grad_norm": 2.0338268148728846, + "language_loss": 0.73845011, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.76635873, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 2.7697978019714355 + }, + { + "auxiliary_loss_clip": 0.01500607, + "auxiliary_loss_mlp": 0.01263414, + "balance_loss_clip": 1.18285954, + "balance_loss_mlp": 1.02346992, + "epoch": 0.36675184127461297, + "flos": 16802266610880.0, + "grad_norm": 1.8765521206093279, + "language_loss": 0.84780318, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.87544334, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 2.763153553009033 + }, + { + "auxiliary_loss_clip": 0.01495964, + "auxiliary_loss_mlp": 0.01270697, + "balance_loss_clip": 1.17703843, + "balance_loss_mlp": 1.02674675, + "epoch": 0.36681196452728093, + "flos": 21908585796480.0, + "grad_norm": 4.187069296961544, + "language_loss": 0.70939159, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.73705816, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 2.80832576751709 + }, + { + "auxiliary_loss_clip": 0.01503121, + "auxiliary_loss_mlp": 0.01286961, + "balance_loss_clip": 1.18450093, + "balance_loss_mlp": 1.04091311, + "epoch": 0.3668720877799489, + "flos": 17714372110560.0, + "grad_norm": 7.220485247078543, + "language_loss": 0.76206994, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78997076, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.766061544418335 + }, + { + "auxiliary_loss_clip": 0.01499216, + "auxiliary_loss_mlp": 0.01285027, + "balance_loss_clip": 1.17987823, + "balance_loss_mlp": 1.04031444, + "epoch": 0.36693221103261686, + "flos": 47048948746560.0, + "grad_norm": 3.4711524724915277, + "language_loss": 0.7016803, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72952276, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 3.0243263244628906 + }, + { + "auxiliary_loss_clip": 0.01501697, + "auxiliary_loss_mlp": 0.01281152, + "balance_loss_clip": 1.18236935, + "balance_loss_mlp": 1.04139781, + "epoch": 0.3669923342852848, + "flos": 15962756338080.0, + "grad_norm": 1.768643101487644, + "language_loss": 0.71862823, + "learning_rate": 2.922369507632716e-06, + "loss": 0.74645674, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 2.7660906314849854 + }, + { + "auxiliary_loss_clip": 0.015023, + "auxiliary_loss_mlp": 0.01275273, + "balance_loss_clip": 1.18384075, + "balance_loss_mlp": 1.03265882, + "epoch": 0.3670524575379528, + "flos": 19976543945280.0, + "grad_norm": 1.9319997299986054, + "language_loss": 0.81591779, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.84369361, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.7890522480010986 + }, + { + "auxiliary_loss_clip": 0.01495603, + "auxiliary_loss_mlp": 0.01283506, + "balance_loss_clip": 1.17681241, + "balance_loss_mlp": 1.03669512, + "epoch": 0.36711258079062076, + "flos": 25705346286240.0, + "grad_norm": 1.758049737287633, + "language_loss": 0.80590677, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83369792, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.9213805198669434 + }, + { + "auxiliary_loss_clip": 0.01716239, + "auxiliary_loss_mlp": 0.01229195, + "balance_loss_clip": 1.40499496, + "balance_loss_mlp": 1.01786041, + "epoch": 0.3671727040432887, + "flos": 60779534424480.0, + "grad_norm": 0.692163395650759, + "language_loss": 0.59059936, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.62005365, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 4.985331058502197 + }, + { + "auxiliary_loss_clip": 0.01494498, + "auxiliary_loss_mlp": 0.01281217, + "balance_loss_clip": 1.17520285, + "balance_loss_mlp": 1.03841138, + "epoch": 0.3672328272959567, + "flos": 18663306217920.0, + "grad_norm": 1.5888485438925737, + "language_loss": 0.74446589, + "learning_rate": 2.92098694412469e-06, + "loss": 0.77222306, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.82570743560791 + }, + { + "auxiliary_loss_clip": 0.01492191, + "auxiliary_loss_mlp": 0.01281332, + "balance_loss_clip": 1.1724546, + "balance_loss_mlp": 1.03890836, + "epoch": 0.3672929505486247, + "flos": 15050840479200.0, + "grad_norm": 2.322924836965128, + "language_loss": 0.73491925, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.76265454, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.7722647190093994 + }, + { + "auxiliary_loss_clip": 0.01499241, + "auxiliary_loss_mlp": 0.01281298, + "balance_loss_clip": 1.18039572, + "balance_loss_mlp": 1.03887439, + "epoch": 0.3673530738012927, + "flos": 20591138043360.0, + "grad_norm": 1.896255119946738, + "language_loss": 0.53276134, + "learning_rate": 2.920295452774744e-06, + "loss": 0.56056678, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.8593780994415283 + }, + { + "auxiliary_loss_clip": 0.01501023, + "auxiliary_loss_mlp": 0.01280185, + "balance_loss_clip": 1.18106711, + "balance_loss_mlp": 1.04138529, + "epoch": 0.36741319705396064, + "flos": 21692013816960.0, + "grad_norm": 1.472348989074511, + "language_loss": 0.80811203, + "learning_rate": 2.919949654746672e-06, + "loss": 0.83592409, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.8086893558502197 + }, + { + "auxiliary_loss_clip": 0.01502063, + "auxiliary_loss_mlp": 0.01280525, + "balance_loss_clip": 1.18355012, + "balance_loss_mlp": 1.03981781, + "epoch": 0.3674733203066286, + "flos": 29864817413280.0, + "grad_norm": 1.7012367690133192, + "language_loss": 0.72541136, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.75323719, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.769754648208618 + }, + { + "auxiliary_loss_clip": 0.01501235, + "auxiliary_loss_mlp": 0.01288115, + "balance_loss_clip": 1.1818912, + "balance_loss_mlp": 1.04931521, + "epoch": 0.36753344355929657, + "flos": 18258836312160.0, + "grad_norm": 1.651235767117964, + "language_loss": 0.85014486, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87803841, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 2.809309959411621 + }, + { + "auxiliary_loss_clip": 0.01492138, + "auxiliary_loss_mlp": 0.01283194, + "balance_loss_clip": 1.17228556, + "balance_loss_mlp": 1.04115188, + "epoch": 0.36759356681196453, + "flos": 25303493423520.0, + "grad_norm": 1.832714092750318, + "language_loss": 0.78711069, + "learning_rate": 2.918912051407413e-06, + "loss": 0.81486398, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.771402359008789 + }, + { + "auxiliary_loss_clip": 0.01506808, + "auxiliary_loss_mlp": 0.01284617, + "balance_loss_clip": 1.18592715, + "balance_loss_mlp": 1.04123914, + "epoch": 0.3676536900646325, + "flos": 21035091528000.0, + "grad_norm": 1.9592434241810173, + "language_loss": 0.67382818, + "learning_rate": 2.918566113919698e-06, + "loss": 0.70174241, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 2.7863831520080566 + }, + { + "auxiliary_loss_clip": 0.01502546, + "auxiliary_loss_mlp": 0.01274038, + "balance_loss_clip": 1.18416381, + "balance_loss_mlp": 1.03428459, + "epoch": 0.36771381331730046, + "flos": 16290307206720.0, + "grad_norm": 2.653663822303427, + "language_loss": 0.76007181, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78783762, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 5.8191142082214355 + }, + { + "auxiliary_loss_clip": 0.01502006, + "auxiliary_loss_mlp": 0.01285928, + "balance_loss_clip": 1.18335295, + "balance_loss_mlp": 1.04674613, + "epoch": 0.36777393656996843, + "flos": 22312183354560.0, + "grad_norm": 1.7678965464524476, + "language_loss": 0.63106048, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65893984, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.8288629055023193 + }, + { + "auxiliary_loss_clip": 0.01499127, + "auxiliary_loss_mlp": 0.0127188, + "balance_loss_clip": 1.18037462, + "balance_loss_mlp": 1.0315541, + "epoch": 0.3678340598226364, + "flos": 26836527024000.0, + "grad_norm": 1.9940509589334756, + "language_loss": 0.73633206, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.76404214, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.7835562229156494 + }, + { + "auxiliary_loss_clip": 0.01499125, + "auxiliary_loss_mlp": 0.01284655, + "balance_loss_clip": 1.1797328, + "balance_loss_mlp": 1.03994179, + "epoch": 0.36789418307530436, + "flos": 21763850480640.0, + "grad_norm": 2.661077173095987, + "language_loss": 0.72710043, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.75493824, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 2.78401517868042 + }, + { + "auxiliary_loss_clip": 0.01497217, + "auxiliary_loss_mlp": 0.01290866, + "balance_loss_clip": 1.17796922, + "balance_loss_mlp": 1.05263829, + "epoch": 0.3679543063279723, + "flos": 15926003586720.0, + "grad_norm": 2.01356886802168, + "language_loss": 0.80727971, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.83516061, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.7578461170196533 + }, + { + "auxiliary_loss_clip": 0.01503492, + "auxiliary_loss_mlp": 0.01288312, + "balance_loss_clip": 1.1852591, + "balance_loss_mlp": 1.04893994, + "epoch": 0.3680144295806403, + "flos": 24277602350880.0, + "grad_norm": 1.8968293403975962, + "language_loss": 0.64543986, + "learning_rate": 2.916489757978126e-06, + "loss": 0.6733579, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 4.25331974029541 + }, + { + "auxiliary_loss_clip": 0.01501901, + "auxiliary_loss_mlp": 0.0129184, + "balance_loss_clip": 1.1830548, + "balance_loss_mlp": 1.05227661, + "epoch": 0.36807455283330826, + "flos": 26106554370240.0, + "grad_norm": 2.484238441033488, + "language_loss": 0.71617401, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.74411142, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 2.8045105934143066 + }, + { + "auxiliary_loss_clip": 0.0150088, + "auxiliary_loss_mlp": 0.01270619, + "balance_loss_clip": 1.18237746, + "balance_loss_mlp": 1.03239131, + "epoch": 0.3681346760859763, + "flos": 24647329697760.0, + "grad_norm": 2.9596123630090894, + "language_loss": 0.69131637, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71903133, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.7825136184692383 + }, + { + "auxiliary_loss_clip": 0.01497031, + "auxiliary_loss_mlp": 0.01293297, + "balance_loss_clip": 1.17726207, + "balance_loss_mlp": 1.05011034, + "epoch": 0.36819479933864424, + "flos": 23880642220800.0, + "grad_norm": 2.897096583588552, + "language_loss": 0.73655325, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.76445651, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 2.8851165771484375 + }, + { + "auxiliary_loss_clip": 0.01500472, + "auxiliary_loss_mlp": 0.01287557, + "balance_loss_clip": 1.18108153, + "balance_loss_mlp": 1.0470407, + "epoch": 0.3682549225913122, + "flos": 25556211303840.0, + "grad_norm": 2.4278118560338076, + "language_loss": 0.74263489, + "learning_rate": 2.915104825441114e-06, + "loss": 0.7705152, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 2.844407558441162 + }, + { + "auxiliary_loss_clip": 0.01502853, + "auxiliary_loss_mlp": 0.012823, + "balance_loss_clip": 1.18389416, + "balance_loss_mlp": 1.04197431, + "epoch": 0.36831504584398017, + "flos": 16948519053120.0, + "grad_norm": 1.7960597765373174, + "language_loss": 0.78422183, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.81207335, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 2.839904308319092 + }, + { + "auxiliary_loss_clip": 0.01497708, + "auxiliary_loss_mlp": 0.01281068, + "balance_loss_clip": 1.17825627, + "balance_loss_mlp": 1.03883517, + "epoch": 0.36837516909664814, + "flos": 19867120110720.0, + "grad_norm": 2.306852106288498, + "language_loss": 0.65722018, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68500793, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 2.7555017471313477 + }, + { + "auxiliary_loss_clip": 0.01496106, + "auxiliary_loss_mlp": 0.01278106, + "balance_loss_clip": 1.17710352, + "balance_loss_mlp": 1.03320217, + "epoch": 0.3684352923493161, + "flos": 37629661713120.0, + "grad_norm": 2.270198267619293, + "language_loss": 0.70133293, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72907507, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 2.924593210220337 + }, + { + "auxiliary_loss_clip": 0.0149931, + "auxiliary_loss_mlp": 0.01277447, + "balance_loss_clip": 1.17988563, + "balance_loss_mlp": 1.03521419, + "epoch": 0.36849541560198407, + "flos": 14467271980320.0, + "grad_norm": 1.785332667016381, + "language_loss": 0.74808824, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77585578, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 2.8069398403167725 + }, + { + "auxiliary_loss_clip": 0.01493726, + "auxiliary_loss_mlp": 0.01266207, + "balance_loss_clip": 1.17479396, + "balance_loss_mlp": 1.02206612, + "epoch": 0.36855553885465203, + "flos": 25772859139680.0, + "grad_norm": 1.6728477316955932, + "language_loss": 0.84832966, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.875929, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.783506393432617 + }, + { + "auxiliary_loss_clip": 0.01706945, + "auxiliary_loss_mlp": 0.01288818, + "balance_loss_clip": 1.40001035, + "balance_loss_mlp": 1.08053589, + "epoch": 0.36861566210732, + "flos": 65056887365760.0, + "grad_norm": 0.821920568037171, + "language_loss": 0.60223687, + "learning_rate": 2.913026385872321e-06, + "loss": 0.63219452, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 3.507100820541382 + }, + { + "auxiliary_loss_clip": 0.01499624, + "auxiliary_loss_mlp": 0.0126646, + "balance_loss_clip": 1.18159652, + "balance_loss_mlp": 1.02499008, + "epoch": 0.36867578535998796, + "flos": 30957045566400.0, + "grad_norm": 1.6702828689330407, + "language_loss": 0.7324788, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.76013964, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.8818352222442627 + }, + { + "auxiliary_loss_clip": 0.01497085, + "auxiliary_loss_mlp": 0.0127846, + "balance_loss_clip": 1.17729855, + "balance_loss_mlp": 1.03107691, + "epoch": 0.3687359086126559, + "flos": 28840367610720.0, + "grad_norm": 2.4830441625533712, + "language_loss": 0.74189997, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76965547, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 2.8457233905792236 + }, + { + "auxiliary_loss_clip": 0.01506901, + "auxiliary_loss_mlp": 0.01272541, + "balance_loss_clip": 1.18859577, + "balance_loss_mlp": 1.03202438, + "epoch": 0.3687960318653239, + "flos": 21398902081920.0, + "grad_norm": 1.7788045458248183, + "language_loss": 0.71504581, + "learning_rate": 2.911986698512874e-06, + "loss": 0.74284029, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.8404061794281006 + }, + { + "auxiliary_loss_clip": 0.01500196, + "auxiliary_loss_mlp": 0.01263846, + "balance_loss_clip": 1.18075919, + "balance_loss_mlp": 1.02199447, + "epoch": 0.36885615511799186, + "flos": 20268100625760.0, + "grad_norm": 1.6800049599642992, + "language_loss": 0.75390643, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.78154683, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 2.8220584392547607 + }, + { + "auxiliary_loss_clip": 0.01697927, + "auxiliary_loss_mlp": 0.01224808, + "balance_loss_clip": 1.39133167, + "balance_loss_mlp": 1.01271057, + "epoch": 0.3689162783706599, + "flos": 63094692627360.0, + "grad_norm": 0.8092132737185841, + "language_loss": 0.58697772, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.6162051, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 3.4369521141052246 + }, + { + "auxiliary_loss_clip": 0.01506894, + "auxiliary_loss_mlp": 0.01274257, + "balance_loss_clip": 1.18720686, + "balance_loss_mlp": 1.03126073, + "epoch": 0.36897640162332784, + "flos": 10963509441120.0, + "grad_norm": 1.8444653641143984, + "language_loss": 0.79221094, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.82002246, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 2.9074442386627197 + }, + { + "auxiliary_loss_clip": 0.01503289, + "auxiliary_loss_mlp": 0.01270464, + "balance_loss_clip": 1.18484616, + "balance_loss_mlp": 1.02765846, + "epoch": 0.3690365248759958, + "flos": 20706402814560.0, + "grad_norm": 1.9764642593739654, + "language_loss": 0.74371094, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.77144843, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.8116726875305176 + }, + { + "auxiliary_loss_clip": 0.01503604, + "auxiliary_loss_mlp": 0.01272303, + "balance_loss_clip": 1.18309212, + "balance_loss_mlp": 1.02682722, + "epoch": 0.3690966481286638, + "flos": 31828757211360.0, + "grad_norm": 2.3411067381929733, + "language_loss": 0.6492539, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67701304, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 2.864356517791748 + }, + { + "auxiliary_loss_clip": 0.01505245, + "auxiliary_loss_mlp": 0.01270428, + "balance_loss_clip": 1.18571365, + "balance_loss_mlp": 1.03067434, + "epoch": 0.36915677138133174, + "flos": 13116598794720.0, + "grad_norm": 2.0582830592668113, + "language_loss": 0.71272606, + "learning_rate": 2.909906390418006e-06, + "loss": 0.74048281, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 2.7034895420074463 + }, + { + "auxiliary_loss_clip": 0.01700456, + "auxiliary_loss_mlp": 0.01224113, + "balance_loss_clip": 1.39418507, + "balance_loss_mlp": 1.01049042, + "epoch": 0.3692168946339997, + "flos": 68693930916480.0, + "grad_norm": 0.7525846935139745, + "language_loss": 0.59265572, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.62190139, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.4790384769439697 + }, + { + "auxiliary_loss_clip": 0.01502526, + "auxiliary_loss_mlp": 0.01283244, + "balance_loss_clip": 1.18273211, + "balance_loss_mlp": 1.04139221, + "epoch": 0.36927701788666767, + "flos": 22020057751680.0, + "grad_norm": 1.703859954429829, + "language_loss": 0.75246954, + "learning_rate": 2.909212678216192e-06, + "loss": 0.7803272, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.754894256591797 + }, + { + "auxiliary_loss_clip": 0.01502224, + "auxiliary_loss_mlp": 0.01272677, + "balance_loss_clip": 1.18257844, + "balance_loss_mlp": 1.03311419, + "epoch": 0.36933714113933563, + "flos": 21837773193120.0, + "grad_norm": 1.9949309671162072, + "language_loss": 0.77431935, + "learning_rate": 2.908865770392555e-06, + "loss": 0.80206835, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 2.847580909729004 + }, + { + "auxiliary_loss_clip": 0.01505551, + "auxiliary_loss_mlp": 0.01269001, + "balance_loss_clip": 1.18576169, + "balance_loss_mlp": 1.02619517, + "epoch": 0.3693972643920036, + "flos": 23693730426720.0, + "grad_norm": 1.6044285469759079, + "language_loss": 0.8186239, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84636939, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.8397269248962402 + }, + { + "auxiliary_loss_clip": 0.01509969, + "auxiliary_loss_mlp": 0.0128621, + "balance_loss_clip": 1.18970335, + "balance_loss_mlp": 1.04512095, + "epoch": 0.36945738764467156, + "flos": 22858961173920.0, + "grad_norm": 2.7942200366659846, + "language_loss": 0.77882993, + "learning_rate": 2.908171851365593e-06, + "loss": 0.80679178, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 4.462657928466797 + }, + { + "auxiliary_loss_clip": 0.01505508, + "auxiliary_loss_mlp": 0.01270801, + "balance_loss_clip": 1.18578613, + "balance_loss_mlp": 1.03219187, + "epoch": 0.36951751089733953, + "flos": 16617706362720.0, + "grad_norm": 2.467772450094699, + "language_loss": 0.77097148, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79873455, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.784040927886963 + }, + { + "auxiliary_loss_clip": 0.01499643, + "auxiliary_loss_mlp": 0.01277109, + "balance_loss_clip": 1.18067575, + "balance_loss_mlp": 1.03773689, + "epoch": 0.3695776341500075, + "flos": 18916441308000.0, + "grad_norm": 1.7444980291816152, + "language_loss": 0.80833673, + "learning_rate": 2.907477794586761e-06, + "loss": 0.83610427, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 2.819343090057373 + }, + { + "auxiliary_loss_clip": 0.01500155, + "auxiliary_loss_mlp": 0.01270295, + "balance_loss_clip": 1.1811173, + "balance_loss_mlp": 1.02996922, + "epoch": 0.36963775740267546, + "flos": 20810289137760.0, + "grad_norm": 1.8371401838156531, + "language_loss": 0.83689541, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.86459994, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.7669641971588135 + }, + { + "auxiliary_loss_clip": 0.01501977, + "auxiliary_loss_mlp": 0.01278506, + "balance_loss_clip": 1.18365335, + "balance_loss_mlp": 1.04313898, + "epoch": 0.3696978806553435, + "flos": 26063960682240.0, + "grad_norm": 2.6089596668104464, + "language_loss": 0.74470699, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.7725119, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.915285348892212 + }, + { + "auxiliary_loss_clip": 0.01505571, + "auxiliary_loss_mlp": 0.01285162, + "balance_loss_clip": 1.18754482, + "balance_loss_mlp": 1.04579008, + "epoch": 0.36975800390801145, + "flos": 26836375311360.0, + "grad_norm": 1.9744783514075641, + "language_loss": 0.71149421, + "learning_rate": 2.906436451364054e-06, + "loss": 0.73940158, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.8037052154541016 + }, + { + "auxiliary_loss_clip": 0.01501563, + "auxiliary_loss_mlp": 0.01285222, + "balance_loss_clip": 1.18368912, + "balance_loss_mlp": 1.04756665, + "epoch": 0.3698181271606794, + "flos": 21144780859680.0, + "grad_norm": 1.7674863878737435, + "language_loss": 0.82042348, + "learning_rate": 2.906089268194611e-06, + "loss": 0.8482914, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.799948215484619 + }, + { + "auxiliary_loss_clip": 0.01677017, + "auxiliary_loss_mlp": 0.01243408, + "balance_loss_clip": 1.37312031, + "balance_loss_mlp": 1.03436279, + "epoch": 0.3698782504133474, + "flos": 66748993490880.0, + "grad_norm": 0.7902289856140432, + "language_loss": 0.63152504, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.66072929, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.494248390197754 + }, + { + "auxiliary_loss_clip": 0.01507006, + "auxiliary_loss_mlp": 0.01287867, + "balance_loss_clip": 1.18865335, + "balance_loss_mlp": 1.05383563, + "epoch": 0.36993837366601534, + "flos": 24313558610880.0, + "grad_norm": 2.0927956533548313, + "language_loss": 0.70100808, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72895682, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 2.8691539764404297 + }, + { + "auxiliary_loss_clip": 0.01501944, + "auxiliary_loss_mlp": 0.01278925, + "balance_loss_clip": 1.18499374, + "balance_loss_mlp": 1.04107904, + "epoch": 0.3699984969186833, + "flos": 24351297494400.0, + "grad_norm": 2.0056635360275523, + "language_loss": 0.72024369, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74805242, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 4.369643449783325 + }, + { + "auxiliary_loss_clip": 0.0149706, + "auxiliary_loss_mlp": 0.01270051, + "balance_loss_clip": 1.18102109, + "balance_loss_mlp": 1.03010678, + "epoch": 0.37005862017135127, + "flos": 19831239707040.0, + "grad_norm": 2.4798977968219345, + "language_loss": 0.67904794, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70671904, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.797386646270752 + }, + { + "auxiliary_loss_clip": 0.0149466, + "auxiliary_loss_mlp": 0.0127833, + "balance_loss_clip": 1.17730379, + "balance_loss_mlp": 1.0429635, + "epoch": 0.37011874342401924, + "flos": 19575866855520.0, + "grad_norm": 1.7163328402853997, + "language_loss": 0.68112648, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.70885634, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 2.7808730602264404 + }, + { + "auxiliary_loss_clip": 0.01495799, + "auxiliary_loss_mlp": 0.01272355, + "balance_loss_clip": 1.17887163, + "balance_loss_mlp": 1.03546262, + "epoch": 0.3701788666766872, + "flos": 20376348687360.0, + "grad_norm": 1.7391612562168086, + "language_loss": 0.81710196, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84478348, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.787498712539673 + }, + { + "auxiliary_loss_clip": 0.0150756, + "auxiliary_loss_mlp": 0.01288681, + "balance_loss_clip": 1.1905719, + "balance_loss_mlp": 1.04644823, + "epoch": 0.37023898992935517, + "flos": 15342738513120.0, + "grad_norm": 3.4897608077829427, + "language_loss": 0.76483095, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.79279339, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 2.739741086959839 + }, + { + "auxiliary_loss_clip": 0.01497861, + "auxiliary_loss_mlp": 0.0127377, + "balance_loss_clip": 1.18071365, + "balance_loss_mlp": 1.03344381, + "epoch": 0.37029911318202313, + "flos": 19576018568160.0, + "grad_norm": 2.304214035155238, + "language_loss": 0.69252324, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.72023952, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 4.241193056106567 + }, + { + "auxiliary_loss_clip": 0.01508942, + "auxiliary_loss_mlp": 0.01263874, + "balance_loss_clip": 1.19187498, + "balance_loss_mlp": 1.0237385, + "epoch": 0.3703592364346911, + "flos": 26215750635840.0, + "grad_norm": 2.109261069210036, + "language_loss": 0.71309197, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.74082017, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.810638427734375 + }, + { + "auxiliary_loss_clip": 0.0150327, + "auxiliary_loss_mlp": 0.0125777, + "balance_loss_clip": 1.18630815, + "balance_loss_mlp": 1.02259374, + "epoch": 0.37041935968735906, + "flos": 20050428729600.0, + "grad_norm": 1.7295227323261178, + "language_loss": 0.79096603, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81857646, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 2.7691378593444824 + }, + { + "auxiliary_loss_clip": 0.01508574, + "auxiliary_loss_mlp": 0.01270656, + "balance_loss_clip": 1.19234633, + "balance_loss_mlp": 1.03338206, + "epoch": 0.3704794829400271, + "flos": 24136090928640.0, + "grad_norm": 2.0246856020159494, + "language_loss": 0.79691339, + "learning_rate": 2.902267988534295e-06, + "loss": 0.82470572, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 2.815246105194092 + }, + { + "auxiliary_loss_clip": 0.01501672, + "auxiliary_loss_mlp": 0.01271664, + "balance_loss_clip": 1.18407059, + "balance_loss_mlp": 1.03419924, + "epoch": 0.37053960619269505, + "flos": 14868669705120.0, + "grad_norm": 2.15534165648007, + "language_loss": 0.79417706, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.82191038, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.7075695991516113 + }, + { + "auxiliary_loss_clip": 0.01502365, + "auxiliary_loss_mlp": 0.01287979, + "balance_loss_clip": 1.18525124, + "balance_loss_mlp": 1.04975128, + "epoch": 0.370599729445363, + "flos": 21363666456960.0, + "grad_norm": 1.5871506770005044, + "language_loss": 0.68376768, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.71167111, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.814779281616211 + }, + { + "auxiliary_loss_clip": 0.01504114, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 1.18717432, + "balance_loss_mlp": 1.03233767, + "epoch": 0.370659852698031, + "flos": 26831217081600.0, + "grad_norm": 3.043219887606196, + "language_loss": 0.83332175, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.86108756, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 2.8080403804779053 + }, + { + "auxiliary_loss_clip": 0.01502686, + "auxiliary_loss_mlp": 0.01276446, + "balance_loss_clip": 1.18423879, + "balance_loss_mlp": 1.03211451, + "epoch": 0.37071997595069894, + "flos": 19101039484320.0, + "grad_norm": 1.7613833035489397, + "language_loss": 0.69410217, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.72189355, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.8534188270568848 + }, + { + "auxiliary_loss_clip": 0.0165953, + "auxiliary_loss_mlp": 0.0122197, + "balance_loss_clip": 1.35621595, + "balance_loss_mlp": 1.00834656, + "epoch": 0.3707800992033669, + "flos": 52183562339520.0, + "grad_norm": 0.7858568613878134, + "language_loss": 0.56669313, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59550816, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 3.1983554363250732 + }, + { + "auxiliary_loss_clip": 0.01513277, + "auxiliary_loss_mlp": 0.01272553, + "balance_loss_clip": 1.19632745, + "balance_loss_mlp": 1.03394401, + "epoch": 0.3708402224560349, + "flos": 19903910790240.0, + "grad_norm": 2.032113625893419, + "language_loss": 0.75579756, + "learning_rate": 2.900181908135584e-06, + "loss": 0.78365588, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 2.752354621887207 + }, + { + "auxiliary_loss_clip": 0.01496632, + "auxiliary_loss_mlp": 0.01268441, + "balance_loss_clip": 1.17973745, + "balance_loss_mlp": 1.0284965, + "epoch": 0.37090034570870284, + "flos": 20009503880640.0, + "grad_norm": 1.7754441877879457, + "language_loss": 0.73835421, + "learning_rate": 2.899834108519755e-06, + "loss": 0.76600486, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 2.7867467403411865 + }, + { + "auxiliary_loss_clip": 0.01504512, + "auxiliary_loss_mlp": 0.01262537, + "balance_loss_clip": 1.18706346, + "balance_loss_mlp": 1.02373695, + "epoch": 0.3709604689613708, + "flos": 24137039132640.0, + "grad_norm": 1.4544324346160613, + "language_loss": 0.79169285, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81936336, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.763749837875366 + }, + { + "auxiliary_loss_clip": 0.01497327, + "auxiliary_loss_mlp": 0.01263036, + "balance_loss_clip": 1.17876339, + "balance_loss_mlp": 1.02118468, + "epoch": 0.37102059221403877, + "flos": 23878214818560.0, + "grad_norm": 1.8504574537415215, + "language_loss": 0.7657665, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.79337013, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 2.817115068435669 + }, + { + "auxiliary_loss_clip": 0.01500136, + "auxiliary_loss_mlp": 0.01268706, + "balance_loss_clip": 1.18162715, + "balance_loss_mlp": 1.02933383, + "epoch": 0.37108071546670673, + "flos": 14503000671360.0, + "grad_norm": 6.688300148540936, + "language_loss": 0.80611777, + "learning_rate": 2.898790504994232e-06, + "loss": 0.83380616, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 2.7185237407684326 + }, + { + "auxiliary_loss_clip": 0.01499206, + "auxiliary_loss_mlp": 0.0127611, + "balance_loss_clip": 1.18036497, + "balance_loss_mlp": 1.03730965, + "epoch": 0.3711408387193747, + "flos": 34565149566720.0, + "grad_norm": 1.8890735347479337, + "language_loss": 0.59563398, + "learning_rate": 2.89844256897035e-06, + "loss": 0.62338716, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 2.932671070098877 + }, + { + "auxiliary_loss_clip": 0.01499338, + "auxiliary_loss_mlp": 0.01280685, + "balance_loss_clip": 1.18146658, + "balance_loss_mlp": 1.04245687, + "epoch": 0.37120096197204266, + "flos": 17312595104160.0, + "grad_norm": 1.8361173843755847, + "language_loss": 0.80587816, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83367836, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.806906223297119 + }, + { + "auxiliary_loss_clip": 0.01494237, + "auxiliary_loss_mlp": 0.01272504, + "balance_loss_clip": 1.17627335, + "balance_loss_mlp": 1.03637385, + "epoch": 0.37126108522471063, + "flos": 30666664658880.0, + "grad_norm": 2.274302366776501, + "language_loss": 0.80098194, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.82864934, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 2.89727520942688 + }, + { + "auxiliary_loss_clip": 0.01499259, + "auxiliary_loss_mlp": 0.0128072, + "balance_loss_clip": 1.18168628, + "balance_loss_mlp": 1.04401827, + "epoch": 0.37132120847737865, + "flos": 25157999544480.0, + "grad_norm": 1.9994173131243353, + "language_loss": 0.89079207, + "learning_rate": 2.89739855653729e-06, + "loss": 0.91859186, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.840055465698242 + }, + { + "auxiliary_loss_clip": 0.01496552, + "auxiliary_loss_mlp": 0.01301588, + "balance_loss_clip": 1.17786777, + "balance_loss_mlp": 1.06793737, + "epoch": 0.3713813317300466, + "flos": 21215517606720.0, + "grad_norm": 1.6511727020978815, + "language_loss": 0.73621178, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.76419324, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.8014588356018066 + }, + { + "auxiliary_loss_clip": 0.0149766, + "auxiliary_loss_mlp": 0.01273029, + "balance_loss_clip": 1.17972112, + "balance_loss_mlp": 1.03518295, + "epoch": 0.3714414549827146, + "flos": 21618811739520.0, + "grad_norm": 2.009731686818778, + "language_loss": 0.76068097, + "learning_rate": 2.896702378079374e-06, + "loss": 0.78838789, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 2.7743520736694336 + }, + { + "auxiliary_loss_clip": 0.01498501, + "auxiliary_loss_mlp": 0.01282628, + "balance_loss_clip": 1.17907262, + "balance_loss_mlp": 1.04249227, + "epoch": 0.37150157823538255, + "flos": 19974230327520.0, + "grad_norm": 4.460099698490428, + "language_loss": 0.72134054, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74915183, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.7417616844177246 + }, + { + "auxiliary_loss_clip": 0.01491813, + "auxiliary_loss_mlp": 0.01274586, + "balance_loss_clip": 1.17329764, + "balance_loss_mlp": 1.03464174, + "epoch": 0.3715617014880505, + "flos": 24862725904320.0, + "grad_norm": 1.8921365901697043, + "language_loss": 0.7003535, + "learning_rate": 2.896006063609283e-06, + "loss": 0.72801745, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.788444995880127 + }, + { + "auxiliary_loss_clip": 0.01485706, + "auxiliary_loss_mlp": 0.01265359, + "balance_loss_clip": 1.16667116, + "balance_loss_mlp": 1.03018308, + "epoch": 0.3716218247407185, + "flos": 20451181675680.0, + "grad_norm": 1.8236355035088, + "language_loss": 0.78221339, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80972403, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 2.834120988845825 + }, + { + "auxiliary_loss_clip": 0.0149598, + "auxiliary_loss_mlp": 0.01276171, + "balance_loss_clip": 1.17669225, + "balance_loss_mlp": 1.03908765, + "epoch": 0.37168194799338644, + "flos": 24135749575200.0, + "grad_norm": 12.55935490058645, + "language_loss": 0.78738487, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.81510639, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.816663980484009 + }, + { + "auxiliary_loss_clip": 0.01669081, + "auxiliary_loss_mlp": 0.01237473, + "balance_loss_clip": 1.36415243, + "balance_loss_mlp": 1.02461243, + "epoch": 0.3717420712460544, + "flos": 67415852957760.0, + "grad_norm": 0.7810885393022995, + "language_loss": 0.57412148, + "learning_rate": 2.894961337112362e-06, + "loss": 0.60318696, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 4.892171621322632 + }, + { + "auxiliary_loss_clip": 0.01488564, + "auxiliary_loss_mlp": 0.01281434, + "balance_loss_clip": 1.16810608, + "balance_loss_mlp": 1.03729367, + "epoch": 0.37180219449872237, + "flos": 22378785932160.0, + "grad_norm": 2.102761094259722, + "language_loss": 0.76879585, + "learning_rate": 2.894613027055066e-06, + "loss": 0.7964958, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 2.8734841346740723 + }, + { + "auxiliary_loss_clip": 0.0148778, + "auxiliary_loss_mlp": 0.01267882, + "balance_loss_clip": 1.1679101, + "balance_loss_mlp": 1.02946329, + "epoch": 0.37186231775139034, + "flos": 21871946829600.0, + "grad_norm": 1.9407596117375343, + "language_loss": 0.72320026, + "learning_rate": 2.894264683073954e-06, + "loss": 0.75075692, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.7676429748535156 + }, + { + "auxiliary_loss_clip": 0.0148831, + "auxiliary_loss_mlp": 0.01272823, + "balance_loss_clip": 1.16955876, + "balance_loss_mlp": 1.03459477, + "epoch": 0.3719224410040583, + "flos": 22417207522560.0, + "grad_norm": 3.55005835845916, + "language_loss": 0.77060175, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79821312, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.8240420818328857 + }, + { + "auxiliary_loss_clip": 0.01491047, + "auxiliary_loss_mlp": 0.01276378, + "balance_loss_clip": 1.17111754, + "balance_loss_mlp": 1.03528941, + "epoch": 0.37198256425672627, + "flos": 25153334380800.0, + "grad_norm": 1.798349786486108, + "language_loss": 0.84201145, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.86968565, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.766263723373413 + }, + { + "auxiliary_loss_clip": 0.01489655, + "auxiliary_loss_mlp": 0.01261984, + "balance_loss_clip": 1.17029345, + "balance_loss_mlp": 1.02356565, + "epoch": 0.37204268750939423, + "flos": 21140229480480.0, + "grad_norm": 1.721394534041364, + "language_loss": 0.85009301, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87760937, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 2.756931781768799 + }, + { + "auxiliary_loss_clip": 0.01486753, + "auxiliary_loss_mlp": 0.0126862, + "balance_loss_clip": 1.16724539, + "balance_loss_mlp": 1.02829385, + "epoch": 0.37210281076206225, + "flos": 21508970695200.0, + "grad_norm": 1.9717459051420914, + "language_loss": 0.65288067, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.68043435, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.734548807144165 + }, + { + "auxiliary_loss_clip": 0.01481782, + "auxiliary_loss_mlp": 0.01268685, + "balance_loss_clip": 1.16210103, + "balance_loss_mlp": 1.02893102, + "epoch": 0.3721629340147302, + "flos": 17349765065280.0, + "grad_norm": 1.7844477224500732, + "language_loss": 0.84072864, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86823332, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.7146334648132324 + }, + { + "auxiliary_loss_clip": 0.01482325, + "auxiliary_loss_mlp": 0.01276316, + "balance_loss_clip": 1.16165948, + "balance_loss_mlp": 1.03331995, + "epoch": 0.3722230572673982, + "flos": 16434094318560.0, + "grad_norm": 5.604518216267273, + "language_loss": 0.88198572, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90957212, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 4.15674614906311 + }, + { + "auxiliary_loss_clip": 0.01483137, + "auxiliary_loss_mlp": 0.01277009, + "balance_loss_clip": 1.16278434, + "balance_loss_mlp": 1.03057981, + "epoch": 0.37228318052006615, + "flos": 22676942112480.0, + "grad_norm": 1.9619888314412006, + "language_loss": 0.73944068, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76704216, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.7208962440490723 + }, + { + "auxiliary_loss_clip": 0.01484556, + "auxiliary_loss_mlp": 0.01275396, + "balance_loss_clip": 1.1639936, + "balance_loss_mlp": 1.03583336, + "epoch": 0.3723433037727341, + "flos": 25267878516960.0, + "grad_norm": 2.280099335030701, + "language_loss": 0.79242641, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82002592, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 4.261371612548828 + }, + { + "auxiliary_loss_clip": 0.01477727, + "auxiliary_loss_mlp": 0.01272483, + "balance_loss_clip": 1.15672421, + "balance_loss_mlp": 1.02967799, + "epoch": 0.3724034270254021, + "flos": 10526193384480.0, + "grad_norm": 1.9585092466093017, + "language_loss": 0.8412773, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86877942, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.801501989364624 + }, + { + "auxiliary_loss_clip": 0.01480897, + "auxiliary_loss_mlp": 0.0126699, + "balance_loss_clip": 1.16077876, + "balance_loss_mlp": 1.02666426, + "epoch": 0.37246355027807004, + "flos": 20268290266560.0, + "grad_norm": 2.3059445058799533, + "language_loss": 0.77110159, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79858053, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.7450006008148193 + }, + { + "auxiliary_loss_clip": 0.01480027, + "auxiliary_loss_mlp": 0.01266226, + "balance_loss_clip": 1.1589185, + "balance_loss_mlp": 1.02723503, + "epoch": 0.372523673530738, + "flos": 19502626849920.0, + "grad_norm": 2.2237031081034497, + "language_loss": 0.79517174, + "learning_rate": 2.890430664088655e-06, + "loss": 0.82263422, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.7734315395355225 + }, + { + "auxiliary_loss_clip": 0.01495038, + "auxiliary_loss_mlp": 0.0126882, + "balance_loss_clip": 1.17443514, + "balance_loss_mlp": 1.03059244, + "epoch": 0.372583796783406, + "flos": 16766158638240.0, + "grad_norm": 2.3548935732423186, + "language_loss": 0.83569258, + "learning_rate": 2.890081914052443e-06, + "loss": 0.8633312, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 4.184356451034546 + }, + { + "auxiliary_loss_clip": 0.01478359, + "auxiliary_loss_mlp": 0.01271134, + "balance_loss_clip": 1.15766549, + "balance_loss_mlp": 1.03614879, + "epoch": 0.37264392003607394, + "flos": 22640189361120.0, + "grad_norm": 1.5066236832322306, + "language_loss": 0.6449759, + "learning_rate": 2.889733130264237e-06, + "loss": 0.67247081, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.7740936279296875 + }, + { + "auxiliary_loss_clip": 0.0147815, + "auxiliary_loss_mlp": 0.01266624, + "balance_loss_clip": 1.1575706, + "balance_loss_mlp": 1.02877808, + "epoch": 0.3727040432887419, + "flos": 19975102675200.0, + "grad_norm": 1.4637855878451833, + "language_loss": 0.74219751, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76964521, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 2.8243601322174072 + }, + { + "auxiliary_loss_clip": 0.01477407, + "auxiliary_loss_mlp": 0.01259812, + "balance_loss_clip": 1.15717936, + "balance_loss_mlp": 1.02082098, + "epoch": 0.37276416654140987, + "flos": 63903405234240.0, + "grad_norm": 1.7259729067502991, + "language_loss": 0.81045389, + "learning_rate": 2.889035461484742e-06, + "loss": 0.83782601, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 3.1240909099578857 + }, + { + "auxiliary_loss_clip": 0.01479321, + "auxiliary_loss_mlp": 0.01279485, + "balance_loss_clip": 1.15896916, + "balance_loss_mlp": 1.04297376, + "epoch": 0.37282428979407783, + "flos": 39789122997600.0, + "grad_norm": 1.8783776136670445, + "language_loss": 0.59962499, + "learning_rate": 2.88868657651991e-06, + "loss": 0.627213, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.9136672019958496 + }, + { + "auxiliary_loss_clip": 0.0147881, + "auxiliary_loss_mlp": 0.01267685, + "balance_loss_clip": 1.15873301, + "balance_loss_mlp": 1.02793086, + "epoch": 0.37288441304674586, + "flos": 22711153677120.0, + "grad_norm": 1.875010911491693, + "language_loss": 0.72832036, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75578529, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 2.7693817615509033 + }, + { + "auxiliary_loss_clip": 0.01483946, + "auxiliary_loss_mlp": 0.01286276, + "balance_loss_clip": 1.16278553, + "balance_loss_mlp": 1.04900217, + "epoch": 0.3729445362994138, + "flos": 18772274914560.0, + "grad_norm": 1.9245962822569236, + "language_loss": 0.74117666, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76887894, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.7641849517822266 + }, + { + "auxiliary_loss_clip": 0.01472432, + "auxiliary_loss_mlp": 0.01265515, + "balance_loss_clip": 1.15164471, + "balance_loss_mlp": 1.03091133, + "epoch": 0.3730046595520818, + "flos": 22458435796800.0, + "grad_norm": 1.6968310640229955, + "language_loss": 0.81808364, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.84546316, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 2.8444173336029053 + }, + { + "auxiliary_loss_clip": 0.01475951, + "auxiliary_loss_mlp": 0.01294018, + "balance_loss_clip": 1.15427101, + "balance_loss_mlp": 1.05521774, + "epoch": 0.37306478280474975, + "flos": 24318678912480.0, + "grad_norm": 1.714072152746564, + "language_loss": 0.75195217, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77965182, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.8773908615112305 + }, + { + "auxiliary_loss_clip": 0.01480177, + "auxiliary_loss_mlp": 0.01272821, + "balance_loss_clip": 1.15863788, + "balance_loss_mlp": 1.03421211, + "epoch": 0.3731249060574177, + "flos": 15816996961920.0, + "grad_norm": 3.221083215959405, + "language_loss": 0.78139234, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80892229, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 2.7757081985473633 + }, + { + "auxiliary_loss_clip": 0.01483585, + "auxiliary_loss_mlp": 0.01280418, + "balance_loss_clip": 1.16176736, + "balance_loss_mlp": 1.04238057, + "epoch": 0.3731850293100857, + "flos": 19830101862240.0, + "grad_norm": 2.1711473925761378, + "language_loss": 0.93786204, + "learning_rate": 2.886592559513283e-06, + "loss": 0.96550202, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 2.7113699913024902 + }, + { + "auxiliary_loss_clip": 0.01476435, + "auxiliary_loss_mlp": 0.01271585, + "balance_loss_clip": 1.15451694, + "balance_loss_mlp": 1.03354764, + "epoch": 0.37324515256275365, + "flos": 19064362589280.0, + "grad_norm": 2.1918302742287006, + "language_loss": 0.8228687, + "learning_rate": 2.886243438932759e-06, + "loss": 0.85034889, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 2.7181224822998047 + }, + { + "auxiliary_loss_clip": 0.01474583, + "auxiliary_loss_mlp": 0.0128994, + "balance_loss_clip": 1.15241516, + "balance_loss_mlp": 1.05381048, + "epoch": 0.3733052758154216, + "flos": 20706440742720.0, + "grad_norm": 2.2580261705341846, + "language_loss": 0.74023306, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.76787829, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 2.7432780265808105 + }, + { + "auxiliary_loss_clip": 0.0148238, + "auxiliary_loss_mlp": 0.01287712, + "balance_loss_clip": 1.16084266, + "balance_loss_mlp": 1.05406189, + "epoch": 0.3733653990680896, + "flos": 20195581255200.0, + "grad_norm": 2.4280785334157726, + "language_loss": 0.6989454, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.7266463, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 2.8046722412109375 + }, + { + "auxiliary_loss_clip": 0.01480146, + "auxiliary_loss_mlp": 0.01276131, + "balance_loss_clip": 1.15865993, + "balance_loss_mlp": 1.03828478, + "epoch": 0.37342552232075754, + "flos": 20341757841120.0, + "grad_norm": 1.7422909219852276, + "language_loss": 0.78013813, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.80770093, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.7877585887908936 + }, + { + "auxiliary_loss_clip": 0.01479216, + "auxiliary_loss_mlp": 0.01268463, + "balance_loss_clip": 1.15845942, + "balance_loss_mlp": 1.03099871, + "epoch": 0.3734856455734255, + "flos": 35520834886560.0, + "grad_norm": 1.5010281128083454, + "language_loss": 0.73299193, + "learning_rate": 2.884846620678668e-06, + "loss": 0.76046872, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 2.8533082008361816 + }, + { + "auxiliary_loss_clip": 0.01483665, + "auxiliary_loss_mlp": 0.01277459, + "balance_loss_clip": 1.16276956, + "balance_loss_mlp": 1.03560686, + "epoch": 0.37354576882609347, + "flos": 21144439506240.0, + "grad_norm": 2.0034115549227196, + "language_loss": 0.81937408, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84698534, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 2.7852625846862793 + }, + { + "auxiliary_loss_clip": 0.01483575, + "auxiliary_loss_mlp": 0.01283159, + "balance_loss_clip": 1.16275907, + "balance_loss_mlp": 1.04569435, + "epoch": 0.37360589207876144, + "flos": 21508932767040.0, + "grad_norm": 2.633791103830177, + "language_loss": 0.78567469, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81334209, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 2.7880513668060303 + }, + { + "auxiliary_loss_clip": 0.01485027, + "auxiliary_loss_mlp": 0.01273615, + "balance_loss_clip": 1.16567266, + "balance_loss_mlp": 1.03863001, + "epoch": 0.37366601533142946, + "flos": 38438867021760.0, + "grad_norm": 2.461895425022113, + "language_loss": 0.85133564, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87892205, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 2.923271417617798 + }, + { + "auxiliary_loss_clip": 0.01486811, + "auxiliary_loss_mlp": 0.0128312, + "balance_loss_clip": 1.16685033, + "balance_loss_mlp": 1.04393816, + "epoch": 0.3737261385840974, + "flos": 18443093135040.0, + "grad_norm": 1.7835537384009044, + "language_loss": 0.68135852, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70905781, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.839756488800049 + }, + { + "auxiliary_loss_clip": 0.01494648, + "auxiliary_loss_mlp": 0.01276443, + "balance_loss_clip": 1.17359185, + "balance_loss_mlp": 1.03955054, + "epoch": 0.3737862618367654, + "flos": 22932087395040.0, + "grad_norm": 2.5019406347182405, + "language_loss": 0.65867352, + "learning_rate": 2.883099843007303e-06, + "loss": 0.68638438, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 2.750554084777832 + }, + { + "auxiliary_loss_clip": 0.01489289, + "auxiliary_loss_mlp": 0.01276236, + "balance_loss_clip": 1.16941905, + "balance_loss_mlp": 1.03877151, + "epoch": 0.37384638508943335, + "flos": 15411237498720.0, + "grad_norm": 1.8700380103698693, + "language_loss": 0.80848515, + "learning_rate": 2.88275038695833e-06, + "loss": 0.83614039, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.7082512378692627 + }, + { + "auxiliary_loss_clip": 0.015032, + "auxiliary_loss_mlp": 0.01272213, + "balance_loss_clip": 1.1852119, + "balance_loss_mlp": 1.03646517, + "epoch": 0.3739065083421013, + "flos": 24283177790400.0, + "grad_norm": 1.5326001557603595, + "language_loss": 0.78980267, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.81755674, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.7798943519592285 + }, + { + "auxiliary_loss_clip": 0.01495396, + "auxiliary_loss_mlp": 0.01267046, + "balance_loss_clip": 1.17611372, + "balance_loss_mlp": 1.02786493, + "epoch": 0.3739666315947693, + "flos": 23005061903520.0, + "grad_norm": 1.7530946911948864, + "language_loss": 0.76890612, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79653054, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 2.7831149101257324 + }, + { + "auxiliary_loss_clip": 0.01493481, + "auxiliary_loss_mlp": 0.01269181, + "balance_loss_clip": 1.17487562, + "balance_loss_mlp": 1.02637601, + "epoch": 0.37402675484743725, + "flos": 19393165087200.0, + "grad_norm": 1.699713407956796, + "language_loss": 0.82928252, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85690916, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 4.524784803390503 + }, + { + "auxiliary_loss_clip": 0.01490414, + "auxiliary_loss_mlp": 0.01274841, + "balance_loss_clip": 1.17202497, + "balance_loss_mlp": 1.03909302, + "epoch": 0.3740868781001052, + "flos": 17127921071520.0, + "grad_norm": 1.687346189991524, + "language_loss": 0.76466274, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.79231536, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.7552530765533447 + }, + { + "auxiliary_loss_clip": 0.0149794, + "auxiliary_loss_mlp": 0.01283126, + "balance_loss_clip": 1.17903948, + "balance_loss_mlp": 1.04547, + "epoch": 0.3741470013527732, + "flos": 20045346356160.0, + "grad_norm": 1.8326152028835185, + "language_loss": 0.70715243, + "learning_rate": 2.881002604868789e-06, + "loss": 0.73496312, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.7977356910705566 + }, + { + "auxiliary_loss_clip": 0.01495146, + "auxiliary_loss_mlp": 0.01279972, + "balance_loss_clip": 1.17689466, + "balance_loss_mlp": 1.04212534, + "epoch": 0.37420712460544114, + "flos": 36899916628320.0, + "grad_norm": 2.171271269957745, + "language_loss": 0.68865824, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71640944, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.8600540161132812 + }, + { + "auxiliary_loss_clip": 0.01486936, + "auxiliary_loss_mlp": 0.01269875, + "balance_loss_clip": 1.16844916, + "balance_loss_mlp": 1.03431702, + "epoch": 0.3742672478581091, + "flos": 22203442226880.0, + "grad_norm": 1.888824697191794, + "language_loss": 0.70039451, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72796267, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.8460514545440674 + }, + { + "auxiliary_loss_clip": 0.01492978, + "auxiliary_loss_mlp": 0.0126432, + "balance_loss_clip": 1.17380512, + "balance_loss_mlp": 1.02666426, + "epoch": 0.3743273711107771, + "flos": 24683892808320.0, + "grad_norm": 2.1346864381567747, + "language_loss": 0.7977246, + "learning_rate": 2.879953534616536e-06, + "loss": 0.82529759, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.835731029510498 + }, + { + "auxiliary_loss_clip": 0.01490133, + "auxiliary_loss_mlp": 0.01264733, + "balance_loss_clip": 1.1706624, + "balance_loss_mlp": 1.02288127, + "epoch": 0.37438749436344504, + "flos": 24461783317440.0, + "grad_norm": 1.7950897766678557, + "language_loss": 0.67788327, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70543194, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.8612759113311768 + }, + { + "auxiliary_loss_clip": 0.01493859, + "auxiliary_loss_mlp": 0.01260359, + "balance_loss_clip": 1.17455459, + "balance_loss_mlp": 1.02117765, + "epoch": 0.374447617616113, + "flos": 21800906657280.0, + "grad_norm": 1.8571754871957042, + "language_loss": 0.83010727, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85764945, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 2.7702231407165527 + }, + { + "auxiliary_loss_clip": 0.01491643, + "auxiliary_loss_mlp": 0.01273988, + "balance_loss_clip": 1.17321169, + "balance_loss_mlp": 1.03709531, + "epoch": 0.374507740868781, + "flos": 17970162171840.0, + "grad_norm": 1.5920018332803891, + "language_loss": 0.75050932, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.77816558, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 4.241339921951294 + }, + { + "auxiliary_loss_clip": 0.01484956, + "auxiliary_loss_mlp": 0.01270205, + "balance_loss_clip": 1.1650933, + "balance_loss_mlp": 1.02873456, + "epoch": 0.374567864121449, + "flos": 16107605438400.0, + "grad_norm": 2.1801240002003035, + "language_loss": 0.83459151, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.86214316, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.820265054702759 + }, + { + "auxiliary_loss_clip": 0.01480353, + "auxiliary_loss_mlp": 0.01273678, + "balance_loss_clip": 1.161798, + "balance_loss_mlp": 1.03335226, + "epoch": 0.37462798737411696, + "flos": 25775400326400.0, + "grad_norm": 2.8643790031077287, + "language_loss": 0.73610544, + "learning_rate": 2.878204417014456e-06, + "loss": 0.76364571, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 4.374856472015381 + }, + { + "auxiliary_loss_clip": 0.01485583, + "auxiliary_loss_mlp": 0.01275798, + "balance_loss_clip": 1.16502857, + "balance_loss_mlp": 1.03795171, + "epoch": 0.3746881106267849, + "flos": 16656393450240.0, + "grad_norm": 2.2397380307175365, + "language_loss": 0.73733974, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.76495361, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 2.720649242401123 + }, + { + "auxiliary_loss_clip": 0.01482895, + "auxiliary_loss_mlp": 0.01266306, + "balance_loss_clip": 1.16261864, + "balance_loss_mlp": 1.02540779, + "epoch": 0.3747482338794529, + "flos": 26180439154560.0, + "grad_norm": 1.775165422615158, + "language_loss": 0.76887703, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79636902, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 2.8160476684570312 + }, + { + "auxiliary_loss_clip": 0.01487822, + "auxiliary_loss_mlp": 0.01282412, + "balance_loss_clip": 1.16894603, + "balance_loss_mlp": 1.045138, + "epoch": 0.37480835713212085, + "flos": 12022701802560.0, + "grad_norm": 1.7905517129374326, + "language_loss": 0.69953144, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.72723377, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.7767515182495117 + }, + { + "auxiliary_loss_clip": 0.01477551, + "auxiliary_loss_mlp": 0.01266376, + "balance_loss_clip": 1.1588192, + "balance_loss_mlp": 1.02853012, + "epoch": 0.3748684803847888, + "flos": 19681118592480.0, + "grad_norm": 1.8857942997530202, + "language_loss": 0.82623458, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.85367393, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 4.211822271347046 + }, + { + "auxiliary_loss_clip": 0.01486346, + "auxiliary_loss_mlp": 0.01272879, + "balance_loss_clip": 1.16584253, + "balance_loss_mlp": 1.03350711, + "epoch": 0.3749286036374568, + "flos": 20523245908320.0, + "grad_norm": 2.48622644887857, + "language_loss": 0.78299892, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.8105911, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 2.7632651329040527 + }, + { + "auxiliary_loss_clip": 0.01483979, + "auxiliary_loss_mlp": 0.01293498, + "balance_loss_clip": 1.16499174, + "balance_loss_mlp": 1.05336285, + "epoch": 0.37498872689012475, + "flos": 20706744168000.0, + "grad_norm": 2.467675073961256, + "language_loss": 0.73249322, + "learning_rate": 2.876104377085234e-06, + "loss": 0.76026803, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.778409481048584 + }, + { + "auxiliary_loss_clip": 0.01476409, + "auxiliary_loss_mlp": 0.01263852, + "balance_loss_clip": 1.15658498, + "balance_loss_mlp": 1.0248611, + "epoch": 0.3750488501427927, + "flos": 21576711117600.0, + "grad_norm": 2.02295765825199, + "language_loss": 0.93160391, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95900649, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 2.852388620376587 + }, + { + "auxiliary_loss_clip": 0.01481068, + "auxiliary_loss_mlp": 0.0126694, + "balance_loss_clip": 1.16205728, + "balance_loss_mlp": 1.02966583, + "epoch": 0.3751089733954607, + "flos": 15925662233280.0, + "grad_norm": 1.9718797196628244, + "language_loss": 0.70939159, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.7368716, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.7037112712860107 + }, + { + "auxiliary_loss_clip": 0.01489101, + "auxiliary_loss_mlp": 0.01268723, + "balance_loss_clip": 1.16921997, + "balance_loss_mlp": 1.02896917, + "epoch": 0.37516909664812864, + "flos": 36287674076160.0, + "grad_norm": 2.0265384066565444, + "language_loss": 0.65626603, + "learning_rate": 2.875053908444895e-06, + "loss": 0.68384421, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 2.8410022258758545 + }, + { + "auxiliary_loss_clip": 0.01482532, + "auxiliary_loss_mlp": 0.01272735, + "balance_loss_clip": 1.16230881, + "balance_loss_mlp": 1.03240895, + "epoch": 0.3752292199007966, + "flos": 13517579309760.0, + "grad_norm": 3.2988492837446612, + "language_loss": 0.758569, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.78612167, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 2.7870230674743652 + }, + { + "auxiliary_loss_clip": 0.0148478, + "auxiliary_loss_mlp": 0.01273184, + "balance_loss_clip": 1.16542935, + "balance_loss_mlp": 1.03228569, + "epoch": 0.3752893431534646, + "flos": 27200527218720.0, + "grad_norm": 2.079299055025328, + "language_loss": 0.83917284, + "learning_rate": 2.874353430085213e-06, + "loss": 0.86675245, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.781526565551758 + }, + { + "auxiliary_loss_clip": 0.01480238, + "auxiliary_loss_mlp": 0.01269175, + "balance_loss_clip": 1.15990019, + "balance_loss_mlp": 1.03094709, + "epoch": 0.3753494664061326, + "flos": 30010273364160.0, + "grad_norm": 2.3130229736884846, + "language_loss": 0.67994279, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70743692, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.8462717533111572 + }, + { + "auxiliary_loss_clip": 0.01480503, + "auxiliary_loss_mlp": 0.01275616, + "balance_loss_clip": 1.16139698, + "balance_loss_mlp": 1.03662491, + "epoch": 0.37540958965880056, + "flos": 24464248647840.0, + "grad_norm": 2.122244721405181, + "language_loss": 0.84066975, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.86823094, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 2.778608560562134 + }, + { + "auxiliary_loss_clip": 0.01481356, + "auxiliary_loss_mlp": 0.0126311, + "balance_loss_clip": 1.16183841, + "balance_loss_mlp": 1.02507257, + "epoch": 0.3754697129114685, + "flos": 16510254792480.0, + "grad_norm": 2.5163118319584017, + "language_loss": 0.83282596, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.86027062, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 2.82757830619812 + }, + { + "auxiliary_loss_clip": 0.01480765, + "auxiliary_loss_mlp": 0.01275807, + "balance_loss_clip": 1.16087365, + "balance_loss_mlp": 1.0392952, + "epoch": 0.3755298361641365, + "flos": 19392937518240.0, + "grad_norm": 6.9208326845112325, + "language_loss": 0.63976014, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66732585, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 2.7311038970947266 + }, + { + "auxiliary_loss_clip": 0.01482135, + "auxiliary_loss_mlp": 0.01272419, + "balance_loss_clip": 1.16340625, + "balance_loss_mlp": 1.03171194, + "epoch": 0.37558995941680445, + "flos": 14722151765760.0, + "grad_norm": 2.130657780080769, + "language_loss": 0.74742186, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77496743, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 2.7614593505859375 + }, + { + "auxiliary_loss_clip": 0.01479486, + "auxiliary_loss_mlp": 0.01279581, + "balance_loss_clip": 1.15982461, + "balance_loss_mlp": 1.04307032, + "epoch": 0.3756500826694724, + "flos": 21692089673280.0, + "grad_norm": 3.0010140736511413, + "language_loss": 0.55817401, + "learning_rate": 2.872251199697598e-06, + "loss": 0.58576465, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 2.7961151599884033 + }, + { + "auxiliary_loss_clip": 0.01488363, + "auxiliary_loss_mlp": 0.01275514, + "balance_loss_clip": 1.16929066, + "balance_loss_mlp": 1.03766751, + "epoch": 0.3757102059221404, + "flos": 26508103807680.0, + "grad_norm": 1.8821279624008465, + "language_loss": 0.84381676, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.87145555, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.8586349487304688 + }, + { + "auxiliary_loss_clip": 0.01472399, + "auxiliary_loss_mlp": 0.012785, + "balance_loss_clip": 1.15342188, + "balance_loss_mlp": 1.04179764, + "epoch": 0.37577032917480835, + "flos": 37340911716480.0, + "grad_norm": 2.202058864440854, + "language_loss": 0.68434852, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.7118575, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 2.9017386436462402 + }, + { + "auxiliary_loss_clip": 0.01481189, + "auxiliary_loss_mlp": 0.01267736, + "balance_loss_clip": 1.16230857, + "balance_loss_mlp": 1.02969861, + "epoch": 0.3758304524274763, + "flos": 21910785629760.0, + "grad_norm": 1.9492667950499192, + "language_loss": 0.78172415, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.8092134, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 2.731201648712158 + }, + { + "auxiliary_loss_clip": 0.01478449, + "auxiliary_loss_mlp": 0.01277773, + "balance_loss_clip": 1.15944946, + "balance_loss_mlp": 1.04107094, + "epoch": 0.3758905756801443, + "flos": 36571986478080.0, + "grad_norm": 2.054404755671412, + "language_loss": 0.58150995, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60907215, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.933704137802124 + }, + { + "auxiliary_loss_clip": 0.01484543, + "auxiliary_loss_mlp": 0.01271126, + "balance_loss_clip": 1.16558337, + "balance_loss_mlp": 1.03194427, + "epoch": 0.37595069893281224, + "flos": 24530737440960.0, + "grad_norm": 1.8027000827564752, + "language_loss": 0.89849985, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.9260565, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 2.787428379058838 + }, + { + "auxiliary_loss_clip": 0.01490786, + "auxiliary_loss_mlp": 0.01274632, + "balance_loss_clip": 1.17104328, + "balance_loss_mlp": 1.03735805, + "epoch": 0.3760108221854802, + "flos": 16436863074240.0, + "grad_norm": 1.996923853969894, + "language_loss": 0.76654607, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.7942003, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.7867186069488525 + }, + { + "auxiliary_loss_clip": 0.01484779, + "auxiliary_loss_mlp": 0.01289304, + "balance_loss_clip": 1.16661179, + "balance_loss_mlp": 1.0516485, + "epoch": 0.37607094543814823, + "flos": 13773445227360.0, + "grad_norm": 2.7961524914849005, + "language_loss": 0.62128878, + "learning_rate": 2.869797092829169e-06, + "loss": 0.64902961, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 2.698209285736084 + }, + { + "auxiliary_loss_clip": 0.01487849, + "auxiliary_loss_mlp": 0.01270609, + "balance_loss_clip": 1.16814768, + "balance_loss_mlp": 1.03009272, + "epoch": 0.3761310686908162, + "flos": 19859572406880.0, + "grad_norm": 3.026606514995314, + "language_loss": 0.73821914, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76580369, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.7815630435943604 + }, + { + "auxiliary_loss_clip": 0.01490845, + "auxiliary_loss_mlp": 0.01279797, + "balance_loss_clip": 1.17177129, + "balance_loss_mlp": 1.042714, + "epoch": 0.37619119194348416, + "flos": 12752484815520.0, + "grad_norm": 2.2141847062780697, + "language_loss": 0.70663106, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.73433745, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 2.774613857269287 + }, + { + "auxiliary_loss_clip": 0.01489076, + "auxiliary_loss_mlp": 0.0126129, + "balance_loss_clip": 1.16916716, + "balance_loss_mlp": 1.02363431, + "epoch": 0.3762513151961521, + "flos": 17532504761760.0, + "grad_norm": 2.2269386571708454, + "language_loss": 0.84691006, + "learning_rate": 2.868744837734889e-06, + "loss": 0.87441373, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 2.7201383113861084 + }, + { + "auxiliary_loss_clip": 0.01480119, + "auxiliary_loss_mlp": 0.01274355, + "balance_loss_clip": 1.16036916, + "balance_loss_mlp": 1.03650904, + "epoch": 0.3763114384488201, + "flos": 23619352576320.0, + "grad_norm": 2.573688086410474, + "language_loss": 0.80826867, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8358134, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.8161048889160156 + }, + { + "auxiliary_loss_clip": 0.01483788, + "auxiliary_loss_mlp": 0.01272958, + "balance_loss_clip": 1.1631577, + "balance_loss_mlp": 1.02862692, + "epoch": 0.37637156170148806, + "flos": 25409124442080.0, + "grad_norm": 4.068705640242768, + "language_loss": 0.7116996, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73926705, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 4.510223627090454 + }, + { + "auxiliary_loss_clip": 0.01481513, + "auxiliary_loss_mlp": 0.01269909, + "balance_loss_clip": 1.16139865, + "balance_loss_mlp": 1.0271039, + "epoch": 0.376431684954156, + "flos": 23443174451520.0, + "grad_norm": 2.2501115949531143, + "language_loss": 0.78438187, + "learning_rate": 2.867692286154594e-06, + "loss": 0.81189609, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.859696626663208 + }, + { + "auxiliary_loss_clip": 0.01486346, + "auxiliary_loss_mlp": 0.01274652, + "balance_loss_clip": 1.16640782, + "balance_loss_mlp": 1.03165555, + "epoch": 0.376491808206824, + "flos": 34207369590240.0, + "grad_norm": 1.8781179384060203, + "language_loss": 0.80539334, + "learning_rate": 2.867341369804132e-06, + "loss": 0.83300328, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.9514524936676025 + }, + { + "auxiliary_loss_clip": 0.01489103, + "auxiliary_loss_mlp": 0.01271409, + "balance_loss_clip": 1.17092752, + "balance_loss_mlp": 1.03299069, + "epoch": 0.37655193145949195, + "flos": 35188391285280.0, + "grad_norm": 1.8602457017203842, + "language_loss": 0.80656952, + "learning_rate": 2.866990420563998e-06, + "loss": 0.83417463, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.8748059272766113 + }, + { + "auxiliary_loss_clip": 0.01481352, + "auxiliary_loss_mlp": 0.01271417, + "balance_loss_clip": 1.16249442, + "balance_loss_mlp": 1.03166354, + "epoch": 0.3766120547121599, + "flos": 16763617451520.0, + "grad_norm": 2.0887160466604517, + "language_loss": 0.79741746, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82494515, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.794989585876465 + }, + { + "auxiliary_loss_clip": 0.01478674, + "auxiliary_loss_mlp": 0.01262566, + "balance_loss_clip": 1.15976, + "balance_loss_mlp": 1.02414739, + "epoch": 0.3766721779648279, + "flos": 23552636214240.0, + "grad_norm": 2.764496475469489, + "language_loss": 0.73739839, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.7648108, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.752434730529785 + }, + { + "auxiliary_loss_clip": 0.01485016, + "auxiliary_loss_mlp": 0.01261021, + "balance_loss_clip": 1.16759706, + "balance_loss_mlp": 1.02298355, + "epoch": 0.37673230121749585, + "flos": 29131582937760.0, + "grad_norm": 1.9239069489937288, + "language_loss": 0.68470442, + "learning_rate": 2.865937375638654e-06, + "loss": 0.71216488, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.8546323776245117 + }, + { + "auxiliary_loss_clip": 0.01492245, + "auxiliary_loss_mlp": 0.01277895, + "balance_loss_clip": 1.17358434, + "balance_loss_mlp": 1.03661585, + "epoch": 0.3767924244701638, + "flos": 28149271685280.0, + "grad_norm": 2.843858527430628, + "language_loss": 0.63454729, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.66224873, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 2.8842573165893555 + }, + { + "auxiliary_loss_clip": 0.01742201, + "auxiliary_loss_mlp": 0.01233154, + "balance_loss_clip": 1.43763602, + "balance_loss_mlp": 1.01724243, + "epoch": 0.37685254772283183, + "flos": 60803315745120.0, + "grad_norm": 0.7585894664174039, + "language_loss": 0.58877754, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.61853111, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 4.913910150527954 + }, + { + "auxiliary_loss_clip": 0.01487729, + "auxiliary_loss_mlp": 0.01274571, + "balance_loss_clip": 1.16887414, + "balance_loss_mlp": 1.03729713, + "epoch": 0.3769126709754998, + "flos": 26035021131840.0, + "grad_norm": 1.8438577861365493, + "language_loss": 0.65200561, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67962861, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 4.252074480056763 + }, + { + "auxiliary_loss_clip": 0.01495634, + "auxiliary_loss_mlp": 0.0127745, + "balance_loss_clip": 1.1787138, + "balance_loss_mlp": 1.04151106, + "epoch": 0.37697279422816776, + "flos": 23581765405440.0, + "grad_norm": 1.7138508190407196, + "language_loss": 0.71055365, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.73828447, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.7652053833007812 + }, + { + "auxiliary_loss_clip": 0.0173877, + "auxiliary_loss_mlp": 0.01233299, + "balance_loss_clip": 1.43485856, + "balance_loss_mlp": 1.01509857, + "epoch": 0.3770329174808357, + "flos": 64752738536160.0, + "grad_norm": 0.758166000814266, + "language_loss": 0.56070602, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.59042668, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.1839354038238525 + }, + { + "auxiliary_loss_clip": 0.01484562, + "auxiliary_loss_mlp": 0.01262694, + "balance_loss_clip": 1.16560388, + "balance_loss_mlp": 1.02408457, + "epoch": 0.3770930407335037, + "flos": 21837469767840.0, + "grad_norm": 1.7810116116647776, + "language_loss": 0.79704267, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82451522, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 2.7062199115753174 + }, + { + "auxiliary_loss_clip": 0.01495481, + "auxiliary_loss_mlp": 0.01275999, + "balance_loss_clip": 1.17836523, + "balance_loss_mlp": 1.03815234, + "epoch": 0.37715316398617166, + "flos": 22750144189920.0, + "grad_norm": 1.6433409773393983, + "language_loss": 0.74253643, + "learning_rate": 2.863479122159103e-06, + "loss": 0.77025115, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.8152661323547363 + }, + { + "auxiliary_loss_clip": 0.01498984, + "auxiliary_loss_mlp": 0.01299536, + "balance_loss_clip": 1.18270183, + "balance_loss_mlp": 1.0639782, + "epoch": 0.3772132872388396, + "flos": 18916479236160.0, + "grad_norm": 2.3665182844758696, + "language_loss": 0.72014618, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74813139, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 4.264742612838745 + }, + { + "auxiliary_loss_clip": 0.01499772, + "auxiliary_loss_mlp": 0.01271067, + "balance_loss_clip": 1.18293381, + "balance_loss_mlp": 1.03150415, + "epoch": 0.3772734104915076, + "flos": 17348058298080.0, + "grad_norm": 1.8211268079020533, + "language_loss": 0.84435117, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.87205958, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 2.7385518550872803 + }, + { + "auxiliary_loss_clip": 0.01487951, + "auxiliary_loss_mlp": 0.01266393, + "balance_loss_clip": 1.17298615, + "balance_loss_mlp": 1.03045428, + "epoch": 0.37733353374417555, + "flos": 32345078353920.0, + "grad_norm": 1.6181117075472222, + "language_loss": 0.75641513, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.78395861, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.8419859409332275 + }, + { + "auxiliary_loss_clip": 0.01489973, + "auxiliary_loss_mlp": 0.01280446, + "balance_loss_clip": 1.17438388, + "balance_loss_mlp": 1.04450655, + "epoch": 0.3773936569968435, + "flos": 23362538454720.0, + "grad_norm": 2.4395703466698175, + "language_loss": 0.85686541, + "learning_rate": 2.862073685241366e-06, + "loss": 0.88456964, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 2.779724597930908 + }, + { + "auxiliary_loss_clip": 0.01489071, + "auxiliary_loss_mlp": 0.01284441, + "balance_loss_clip": 1.17170465, + "balance_loss_mlp": 1.04831123, + "epoch": 0.3774537802495115, + "flos": 21468728553120.0, + "grad_norm": 4.844723566326223, + "language_loss": 0.78744113, + "learning_rate": 2.861722244253818e-06, + "loss": 0.81517625, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 2.818415641784668 + }, + { + "auxiliary_loss_clip": 0.01480072, + "auxiliary_loss_mlp": 0.0128076, + "balance_loss_clip": 1.16378236, + "balance_loss_mlp": 1.04329515, + "epoch": 0.37751390350217945, + "flos": 24976094267520.0, + "grad_norm": 2.047106609172069, + "language_loss": 0.82879853, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85640693, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.7828409671783447 + }, + { + "auxiliary_loss_clip": 0.01484314, + "auxiliary_loss_mlp": 0.01269907, + "balance_loss_clip": 1.16830897, + "balance_loss_mlp": 1.03492212, + "epoch": 0.3775740267548474, + "flos": 27821607032160.0, + "grad_norm": 2.0228995372859453, + "language_loss": 0.75204039, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77958256, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 2.8077306747436523 + }, + { + "auxiliary_loss_clip": 0.01477776, + "auxiliary_loss_mlp": 0.01273265, + "balance_loss_clip": 1.16127753, + "balance_loss_mlp": 1.03541827, + "epoch": 0.3776341500075154, + "flos": 22567442421600.0, + "grad_norm": 1.421435551948664, + "language_loss": 0.76176971, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78928012, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 2.8174493312835693 + }, + { + "auxiliary_loss_clip": 0.01478363, + "auxiliary_loss_mlp": 0.01268581, + "balance_loss_clip": 1.1623292, + "balance_loss_mlp": 1.03340459, + "epoch": 0.3776942732601834, + "flos": 23079553538400.0, + "grad_norm": 3.415840887883307, + "language_loss": 0.84173989, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86920929, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 2.8423774242401123 + }, + { + "auxiliary_loss_clip": 0.01481483, + "auxiliary_loss_mlp": 0.01267864, + "balance_loss_clip": 1.16632354, + "balance_loss_mlp": 1.02982676, + "epoch": 0.37775439651285136, + "flos": 21726490878720.0, + "grad_norm": 1.7037948945030494, + "language_loss": 0.69563901, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72313249, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 2.8499178886413574 + }, + { + "auxiliary_loss_clip": 0.01489599, + "auxiliary_loss_mlp": 0.01275139, + "balance_loss_clip": 1.17401898, + "balance_loss_mlp": 1.0386281, + "epoch": 0.37781451976551933, + "flos": 23990028127200.0, + "grad_norm": 1.9826326140083625, + "language_loss": 0.76112938, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78877681, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 2.8380486965179443 + }, + { + "auxiliary_loss_clip": 0.01478962, + "auxiliary_loss_mlp": 0.01269183, + "balance_loss_clip": 1.16211152, + "balance_loss_mlp": 1.03000104, + "epoch": 0.3778746430181873, + "flos": 13729258556640.0, + "grad_norm": 2.0659909522556212, + "language_loss": 0.85526466, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88274604, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.824946880340576 + }, + { + "auxiliary_loss_clip": 0.01479068, + "auxiliary_loss_mlp": 0.01272693, + "balance_loss_clip": 1.16245294, + "balance_loss_mlp": 1.03656316, + "epoch": 0.37793476627085526, + "flos": 19462346779680.0, + "grad_norm": 1.8053300936836838, + "language_loss": 0.84221816, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86973584, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 2.815124273300171 + }, + { + "auxiliary_loss_clip": 0.01485199, + "auxiliary_loss_mlp": 0.01273343, + "balance_loss_clip": 1.17006946, + "balance_loss_mlp": 1.03625941, + "epoch": 0.3779948895235232, + "flos": 10708553799360.0, + "grad_norm": 2.358354080681472, + "language_loss": 0.81441486, + "learning_rate": 2.858557806518775e-06, + "loss": 0.84200025, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 2.8580310344696045 + }, + { + "auxiliary_loss_clip": 0.01475808, + "auxiliary_loss_mlp": 0.01272108, + "balance_loss_clip": 1.16075897, + "balance_loss_mlp": 1.03292704, + "epoch": 0.3780550127761912, + "flos": 22312221282720.0, + "grad_norm": 3.461521419689523, + "language_loss": 0.73838401, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.76586318, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 2.787837505340576 + }, + { + "auxiliary_loss_clip": 0.01486648, + "auxiliary_loss_mlp": 0.0126702, + "balance_loss_clip": 1.17101049, + "balance_loss_mlp": 1.03069949, + "epoch": 0.37811513602885916, + "flos": 28953242907840.0, + "grad_norm": 3.641927579954384, + "language_loss": 0.75488704, + "learning_rate": 2.857854239668352e-06, + "loss": 0.78242373, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.8252370357513428 + }, + { + "auxiliary_loss_clip": 0.01484957, + "auxiliary_loss_mlp": 0.012723, + "balance_loss_clip": 1.1691978, + "balance_loss_mlp": 1.03292775, + "epoch": 0.3781752592815271, + "flos": 23115206373120.0, + "grad_norm": 1.9701039483376896, + "language_loss": 0.74005949, + "learning_rate": 2.857502407441593e-06, + "loss": 0.76763207, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.846417188644409 + }, + { + "auxiliary_loss_clip": 0.01477968, + "auxiliary_loss_mlp": 0.01276402, + "balance_loss_clip": 1.16134453, + "balance_loss_mlp": 1.0362668, + "epoch": 0.3782353825341951, + "flos": 19757999701440.0, + "grad_norm": 2.259734174826887, + "language_loss": 0.79920685, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.82675058, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 2.768998622894287 + }, + { + "auxiliary_loss_clip": 0.01496715, + "auxiliary_loss_mlp": 0.01282351, + "balance_loss_clip": 1.18101025, + "balance_loss_mlp": 1.04278827, + "epoch": 0.37829550578686305, + "flos": 22052828046240.0, + "grad_norm": 2.1246095462104595, + "language_loss": 0.76484364, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.79263425, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.8358914852142334 + }, + { + "auxiliary_loss_clip": 0.01487727, + "auxiliary_loss_mlp": 0.01278988, + "balance_loss_clip": 1.17045951, + "balance_loss_mlp": 1.04209518, + "epoch": 0.378355629039531, + "flos": 16472136627360.0, + "grad_norm": 2.1505565387215517, + "language_loss": 0.69528806, + "learning_rate": 2.856446715715224e-06, + "loss": 0.72295523, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.7307097911834717 + }, + { + "auxiliary_loss_clip": 0.01479068, + "auxiliary_loss_mlp": 0.01270631, + "balance_loss_clip": 1.16333437, + "balance_loss_mlp": 1.03373849, + "epoch": 0.378415752292199, + "flos": 19976923226880.0, + "grad_norm": 2.2649252185431674, + "language_loss": 0.71379912, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.74129611, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 2.7455554008483887 + }, + { + "auxiliary_loss_clip": 0.01481068, + "auxiliary_loss_mlp": 0.01279834, + "balance_loss_clip": 1.16431713, + "balance_loss_mlp": 1.04217839, + "epoch": 0.378475875544867, + "flos": 14649139329120.0, + "grad_norm": 2.2137765415676434, + "language_loss": 0.82967168, + "learning_rate": 2.855742758826011e-06, + "loss": 0.85728073, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.7747819423675537 + }, + { + "auxiliary_loss_clip": 0.01479216, + "auxiliary_loss_mlp": 0.01263541, + "balance_loss_clip": 1.16176987, + "balance_loss_mlp": 1.02493167, + "epoch": 0.37853599879753497, + "flos": 26653408045920.0, + "grad_norm": 2.2215946482587516, + "language_loss": 0.71821511, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.74564266, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 2.7974538803100586 + }, + { + "auxiliary_loss_clip": 0.01482119, + "auxiliary_loss_mlp": 0.01262907, + "balance_loss_clip": 1.16584551, + "balance_loss_mlp": 1.02753985, + "epoch": 0.37859612205020293, + "flos": 17313922589760.0, + "grad_norm": 3.1919482573356235, + "language_loss": 0.77203542, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79948568, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 2.7108066082000732 + }, + { + "auxiliary_loss_clip": 0.01474012, + "auxiliary_loss_mlp": 0.01266915, + "balance_loss_clip": 1.15749967, + "balance_loss_mlp": 1.02849627, + "epoch": 0.3786562453028709, + "flos": 18222197345280.0, + "grad_norm": 2.3581572976282907, + "language_loss": 0.79244769, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81985694, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 4.492744207382202 + }, + { + "auxiliary_loss_clip": 0.01484399, + "auxiliary_loss_mlp": 0.01264228, + "balance_loss_clip": 1.16779482, + "balance_loss_mlp": 1.02733541, + "epoch": 0.37871636855553886, + "flos": 21216996804960.0, + "grad_norm": 1.7305442729914509, + "language_loss": 0.84263694, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.87012315, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.8237860202789307 + }, + { + "auxiliary_loss_clip": 0.01475988, + "auxiliary_loss_mlp": 0.01270558, + "balance_loss_clip": 1.15955794, + "balance_loss_mlp": 1.03309369, + "epoch": 0.3787764918082068, + "flos": 20954455531200.0, + "grad_norm": 2.3728393311872726, + "language_loss": 0.76357043, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.79103589, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.8494935035705566 + }, + { + "auxiliary_loss_clip": 0.01482068, + "auxiliary_loss_mlp": 0.01277609, + "balance_loss_clip": 1.16597641, + "balance_loss_mlp": 1.03690147, + "epoch": 0.3788366150608748, + "flos": 17309750492160.0, + "grad_norm": 1.9579687233382888, + "language_loss": 0.82577145, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.85336816, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.774900197982788 + }, + { + "auxiliary_loss_clip": 0.01474474, + "auxiliary_loss_mlp": 0.01274598, + "balance_loss_clip": 1.15825415, + "balance_loss_mlp": 1.03865886, + "epoch": 0.37889673831354276, + "flos": 24313103472960.0, + "grad_norm": 1.7661233773721454, + "language_loss": 0.67874122, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.70623195, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.803497076034546 + }, + { + "auxiliary_loss_clip": 0.01481388, + "auxiliary_loss_mlp": 0.01268393, + "balance_loss_clip": 1.16385603, + "balance_loss_mlp": 1.03283501, + "epoch": 0.3789568615662107, + "flos": 26685799058880.0, + "grad_norm": 1.9293459744824923, + "language_loss": 0.68246937, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70996726, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.826831579208374 + }, + { + "auxiliary_loss_clip": 0.01471335, + "auxiliary_loss_mlp": 0.01266574, + "balance_loss_clip": 1.1549226, + "balance_loss_mlp": 1.02929997, + "epoch": 0.3790169848188787, + "flos": 23587378773120.0, + "grad_norm": 4.934999488671244, + "language_loss": 0.78260732, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.80998641, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.818760633468628 + }, + { + "auxiliary_loss_clip": 0.01481501, + "auxiliary_loss_mlp": 0.01282743, + "balance_loss_clip": 1.1640377, + "balance_loss_mlp": 1.04489708, + "epoch": 0.37907710807154665, + "flos": 18439148606400.0, + "grad_norm": 1.8086846449548841, + "language_loss": 0.80395949, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.83160192, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.7320196628570557 + }, + { + "auxiliary_loss_clip": 0.01700855, + "auxiliary_loss_mlp": 0.01219543, + "balance_loss_clip": 1.40005219, + "balance_loss_mlp": 1.00592041, + "epoch": 0.3791372313242146, + "flos": 50112701965440.0, + "grad_norm": 0.9808448598324472, + "language_loss": 0.64498019, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.6741842, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 4.77236533164978 + }, + { + "auxiliary_loss_clip": 0.01479748, + "auxiliary_loss_mlp": 0.01287783, + "balance_loss_clip": 1.16166544, + "balance_loss_mlp": 1.05165291, + "epoch": 0.3791973545768826, + "flos": 24318868553280.0, + "grad_norm": 1.4842996203789383, + "language_loss": 0.73485893, + "learning_rate": 2.851516295441817e-06, + "loss": 0.7625342, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 4.357557058334351 + }, + { + "auxiliary_loss_clip": 0.01474669, + "auxiliary_loss_mlp": 0.01282671, + "balance_loss_clip": 1.15825891, + "balance_loss_mlp": 1.04444313, + "epoch": 0.3792574778295506, + "flos": 21582058988160.0, + "grad_norm": 10.832226828492145, + "language_loss": 0.78389949, + "learning_rate": 2.851163879959112e-06, + "loss": 0.81147289, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.8845043182373047 + }, + { + "auxiliary_loss_clip": 0.01482784, + "auxiliary_loss_mlp": 0.01279143, + "balance_loss_clip": 1.16483235, + "balance_loss_mlp": 1.0420593, + "epoch": 0.37931760108221857, + "flos": 22274899608960.0, + "grad_norm": 2.718458110915107, + "language_loss": 0.72825289, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75587213, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.7820093631744385 + }, + { + "auxiliary_loss_clip": 0.01482721, + "auxiliary_loss_mlp": 0.01275619, + "balance_loss_clip": 1.16629922, + "balance_loss_mlp": 1.03815389, + "epoch": 0.37937772433488653, + "flos": 19685063121120.0, + "grad_norm": 1.5238505509689237, + "language_loss": 0.7879709, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.81555438, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.9045298099517822 + }, + { + "auxiliary_loss_clip": 0.01480612, + "auxiliary_loss_mlp": 0.01273837, + "balance_loss_clip": 1.1631248, + "balance_loss_mlp": 1.03751612, + "epoch": 0.3794378475875545, + "flos": 19101418765920.0, + "grad_norm": 5.723844238582651, + "language_loss": 0.76311749, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.79066199, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.827854871749878 + }, + { + "auxiliary_loss_clip": 0.01473436, + "auxiliary_loss_mlp": 0.01264007, + "balance_loss_clip": 1.15698814, + "balance_loss_mlp": 1.02730489, + "epoch": 0.37949797084022246, + "flos": 20341757841120.0, + "grad_norm": 2.4185961848233757, + "language_loss": 0.70765948, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73503393, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 4.333822727203369 + }, + { + "auxiliary_loss_clip": 0.01668606, + "auxiliary_loss_mlp": 0.0121875, + "balance_loss_clip": 1.37007761, + "balance_loss_mlp": 1.00665283, + "epoch": 0.37955809409289043, + "flos": 63978541283520.0, + "grad_norm": 0.7790574413354724, + "language_loss": 0.56037712, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58925068, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 3.2969741821289062 + }, + { + "auxiliary_loss_clip": 0.01469932, + "auxiliary_loss_mlp": 0.01268496, + "balance_loss_clip": 1.1523211, + "balance_loss_mlp": 1.03255689, + "epoch": 0.3796182173455584, + "flos": 31543382820960.0, + "grad_norm": 2.1651551430190525, + "language_loss": 0.71865022, + "learning_rate": 2.849048709730083e-06, + "loss": 0.7460345, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.856375217437744 + }, + { + "auxiliary_loss_clip": 0.01473275, + "auxiliary_loss_mlp": 0.01287921, + "balance_loss_clip": 1.15506136, + "balance_loss_mlp": 1.04988408, + "epoch": 0.37967834059822636, + "flos": 12132466990560.0, + "grad_norm": 1.902102596456507, + "language_loss": 0.73172289, + "learning_rate": 2.848696068594545e-06, + "loss": 0.7593348, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 2.739745855331421 + }, + { + "auxiliary_loss_clip": 0.01465345, + "auxiliary_loss_mlp": 0.012751, + "balance_loss_clip": 1.14696479, + "balance_loss_mlp": 1.03858852, + "epoch": 0.3797384638508943, + "flos": 39351731084640.0, + "grad_norm": 2.0101072846318053, + "language_loss": 0.70638454, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73378897, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.925950527191162 + }, + { + "auxiliary_loss_clip": 0.01473446, + "auxiliary_loss_mlp": 0.01269972, + "balance_loss_clip": 1.15592766, + "balance_loss_mlp": 1.03613138, + "epoch": 0.3797985871035623, + "flos": 34056641625120.0, + "grad_norm": 1.8135340355369483, + "language_loss": 0.65299869, + "learning_rate": 2.847990689788923e-06, + "loss": 0.68043286, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 2.8536534309387207 + }, + { + "auxiliary_loss_clip": 0.0147677, + "auxiliary_loss_mlp": 0.01270044, + "balance_loss_clip": 1.15808702, + "balance_loss_mlp": 1.03753853, + "epoch": 0.37985871035623026, + "flos": 23224440566880.0, + "grad_norm": 2.251687502224833, + "language_loss": 0.85611516, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.88358331, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 2.765097141265869 + }, + { + "auxiliary_loss_clip": 0.01488042, + "auxiliary_loss_mlp": 0.01274368, + "balance_loss_clip": 1.17010689, + "balance_loss_mlp": 1.03594899, + "epoch": 0.3799188336088982, + "flos": 18116983536480.0, + "grad_norm": 2.3059746784134973, + "language_loss": 0.7654652, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.79308927, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.740666151046753 + }, + { + "auxiliary_loss_clip": 0.01473083, + "auxiliary_loss_mlp": 0.01268522, + "balance_loss_clip": 1.15542459, + "balance_loss_mlp": 1.0280056, + "epoch": 0.3799789568615662, + "flos": 21874070806560.0, + "grad_norm": 1.7499806156829387, + "language_loss": 0.64118129, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66859734, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 2.823181629180908 + }, + { + "auxiliary_loss_clip": 0.01476386, + "auxiliary_loss_mlp": 0.01264082, + "balance_loss_clip": 1.15761805, + "balance_loss_mlp": 1.02604461, + "epoch": 0.3800390801142342, + "flos": 32965703029440.0, + "grad_norm": 2.2359674119188258, + "language_loss": 0.71275067, + "learning_rate": 2.846579546413992e-06, + "loss": 0.7401554, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 2.876513957977295 + }, + { + "auxiliary_loss_clip": 0.01476763, + "auxiliary_loss_mlp": 0.01281556, + "balance_loss_clip": 1.15809584, + "balance_loss_mlp": 1.04485393, + "epoch": 0.38009920336690217, + "flos": 26909539460640.0, + "grad_norm": 1.9698198569910919, + "language_loss": 0.74431324, + "learning_rate": 2.846226680280859e-06, + "loss": 0.77189648, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 2.846895217895508 + }, + { + "auxiliary_loss_clip": 0.01474543, + "auxiliary_loss_mlp": 0.01268323, + "balance_loss_clip": 1.15680373, + "balance_loss_mlp": 1.0308578, + "epoch": 0.38015932661957014, + "flos": 22490940594240.0, + "grad_norm": 2.5266460867373, + "language_loss": 0.85090327, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87833196, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 2.8333892822265625 + }, + { + "auxiliary_loss_clip": 0.0147429, + "auxiliary_loss_mlp": 0.01283001, + "balance_loss_clip": 1.15663552, + "balance_loss_mlp": 1.04572725, + "epoch": 0.3802194498722381, + "flos": 21983153287680.0, + "grad_norm": 1.9728460918453472, + "language_loss": 0.73316216, + "learning_rate": 2.845520851760973e-06, + "loss": 0.76073503, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 2.758005380630493 + }, + { + "auxiliary_loss_clip": 0.01472986, + "auxiliary_loss_mlp": 0.01278929, + "balance_loss_clip": 1.15339077, + "balance_loss_mlp": 1.0395565, + "epoch": 0.38027957312490607, + "flos": 21327027490080.0, + "grad_norm": 1.9516582513074971, + "language_loss": 0.84158683, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86910599, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 2.7827675342559814 + }, + { + "auxiliary_loss_clip": 0.0147074, + "auxiliary_loss_mlp": 0.01268901, + "balance_loss_clip": 1.15269542, + "balance_loss_mlp": 1.03372478, + "epoch": 0.38033969637757403, + "flos": 16693146201600.0, + "grad_norm": 2.6151106775502506, + "language_loss": 0.79190123, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81929761, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 2.706254243850708 + }, + { + "auxiliary_loss_clip": 0.01468371, + "auxiliary_loss_mlp": 0.01270256, + "balance_loss_clip": 1.14947462, + "balance_loss_mlp": 1.03488922, + "epoch": 0.380399819630242, + "flos": 36213827220000.0, + "grad_norm": 1.9340733518714477, + "language_loss": 0.7291401, + "learning_rate": 2.844461868547842e-06, + "loss": 0.75652635, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 2.9145796298980713 + }, + { + "auxiliary_loss_clip": 0.01476334, + "auxiliary_loss_mlp": 0.01266084, + "balance_loss_clip": 1.15761387, + "balance_loss_mlp": 1.02766538, + "epoch": 0.38045994288290996, + "flos": 21291033301920.0, + "grad_norm": 1.7860554414835763, + "language_loss": 0.8338716, + "learning_rate": 2.844108810081459e-06, + "loss": 0.86129576, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.741974115371704 + }, + { + "auxiliary_loss_clip": 0.01462163, + "auxiliary_loss_mlp": 0.01258574, + "balance_loss_clip": 1.14202261, + "balance_loss_mlp": 1.0228256, + "epoch": 0.38052006613557793, + "flos": 20924984986560.0, + "grad_norm": 1.5536619155752511, + "language_loss": 0.61664104, + "learning_rate": 2.843755719606385e-06, + "loss": 0.64384842, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.821760892868042 + }, + { + "auxiliary_loss_clip": 0.01474804, + "auxiliary_loss_mlp": 0.01271257, + "balance_loss_clip": 1.15513396, + "balance_loss_mlp": 1.03302884, + "epoch": 0.3805801893882459, + "flos": 20992535768160.0, + "grad_norm": 1.9242492391502453, + "language_loss": 0.56617218, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.59363282, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 2.7624547481536865 + }, + { + "auxiliary_loss_clip": 0.01466567, + "auxiliary_loss_mlp": 0.01270846, + "balance_loss_clip": 1.14714384, + "balance_loss_mlp": 1.03299952, + "epoch": 0.38064031264091386, + "flos": 25561066108320.0, + "grad_norm": 1.6443470437702907, + "language_loss": 0.66464531, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.69201946, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.8466897010803223 + }, + { + "auxiliary_loss_clip": 0.01468178, + "auxiliary_loss_mlp": 0.01284084, + "balance_loss_clip": 1.14865279, + "balance_loss_mlp": 1.04871774, + "epoch": 0.3807004358935818, + "flos": 15087669086880.0, + "grad_norm": 1.6451042841304062, + "language_loss": 0.76248574, + "learning_rate": 2.842696256262919e-06, + "loss": 0.79000837, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 2.774663209915161 + }, + { + "auxiliary_loss_clip": 0.01458803, + "auxiliary_loss_mlp": 0.01269523, + "balance_loss_clip": 1.13757718, + "balance_loss_mlp": 1.03110433, + "epoch": 0.3807605591462498, + "flos": 16401286095840.0, + "grad_norm": 3.2881058186556236, + "language_loss": 0.8194437, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84672689, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 2.6682920455932617 + }, + { + "auxiliary_loss_clip": 0.01464863, + "auxiliary_loss_mlp": 0.0127511, + "balance_loss_clip": 1.14551473, + "balance_loss_mlp": 1.03650057, + "epoch": 0.3808206823989178, + "flos": 29059518705120.0, + "grad_norm": 1.7969995762495805, + "language_loss": 0.86323404, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.89063382, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 2.861781597137451 + }, + { + "auxiliary_loss_clip": 0.01464093, + "auxiliary_loss_mlp": 0.01271376, + "balance_loss_clip": 1.14502311, + "balance_loss_mlp": 1.03238487, + "epoch": 0.3808808056515858, + "flos": 15707686911840.0, + "grad_norm": 1.829768176618608, + "language_loss": 0.79104358, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81839824, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 4.518392324447632 + }, + { + "auxiliary_loss_clip": 0.01467912, + "auxiliary_loss_mlp": 0.01278096, + "balance_loss_clip": 1.14849329, + "balance_loss_mlp": 1.03834236, + "epoch": 0.38094092890425374, + "flos": 20706706239840.0, + "grad_norm": 1.9103233200914842, + "language_loss": 0.73023772, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75769782, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 2.7624783515930176 + }, + { + "auxiliary_loss_clip": 0.01461708, + "auxiliary_loss_mlp": 0.0127269, + "balance_loss_clip": 1.14247727, + "balance_loss_mlp": 1.03713238, + "epoch": 0.3810010521569217, + "flos": 20670067272960.0, + "grad_norm": 1.8098336513659654, + "language_loss": 0.6919446, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71928859, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.769641876220703 + }, + { + "auxiliary_loss_clip": 0.01476, + "auxiliary_loss_mlp": 0.01273245, + "balance_loss_clip": 1.15628481, + "balance_loss_mlp": 1.03597116, + "epoch": 0.38106117540958967, + "flos": 31830084696960.0, + "grad_norm": 1.8514982349682145, + "language_loss": 0.63881826, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.66631067, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.8723950386047363 + }, + { + "auxiliary_loss_clip": 0.01465126, + "auxiliary_loss_mlp": 0.01264082, + "balance_loss_clip": 1.14704287, + "balance_loss_mlp": 1.02470934, + "epoch": 0.38112129866225763, + "flos": 16904256526080.0, + "grad_norm": 1.780223408501458, + "language_loss": 0.6926893, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71998131, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.7696516513824463 + }, + { + "auxiliary_loss_clip": 0.01466244, + "auxiliary_loss_mlp": 0.01259695, + "balance_loss_clip": 1.14760149, + "balance_loss_mlp": 1.02146685, + "epoch": 0.3811814219149256, + "flos": 20889370080000.0, + "grad_norm": 2.5344712145074784, + "language_loss": 0.68438721, + "learning_rate": 2.839869615637177e-06, + "loss": 0.71164662, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.7417471408843994 + }, + { + "auxiliary_loss_clip": 0.01468468, + "auxiliary_loss_mlp": 0.01283175, + "balance_loss_clip": 1.14823604, + "balance_loss_mlp": 1.043612, + "epoch": 0.38124154516759357, + "flos": 16692691063680.0, + "grad_norm": 2.068154482162739, + "language_loss": 0.90055943, + "learning_rate": 2.839516142102522e-06, + "loss": 0.92807579, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.7445898056030273 + }, + { + "auxiliary_loss_clip": 0.01462889, + "auxiliary_loss_mlp": 0.01275406, + "balance_loss_clip": 1.14343131, + "balance_loss_mlp": 1.03660643, + "epoch": 0.38130166842026153, + "flos": 19683963204480.0, + "grad_norm": 1.7501891743894509, + "language_loss": 0.75125092, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77863395, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.8027727603912354 + }, + { + "auxiliary_loss_clip": 0.01468453, + "auxiliary_loss_mlp": 0.01267693, + "balance_loss_clip": 1.14910519, + "balance_loss_mlp": 1.02984691, + "epoch": 0.3813617916729295, + "flos": 22200521758560.0, + "grad_norm": 1.7878070023867452, + "language_loss": 0.83313936, + "learning_rate": 2.838809099543007e-06, + "loss": 0.86050081, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.7597484588623047 + }, + { + "auxiliary_loss_clip": 0.01464994, + "auxiliary_loss_mlp": 0.01272877, + "balance_loss_clip": 1.14533889, + "balance_loss_mlp": 1.0335052, + "epoch": 0.38142191492559746, + "flos": 19098801722880.0, + "grad_norm": 2.4127428010450447, + "language_loss": 0.76854891, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79592764, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.825883388519287 + }, + { + "auxiliary_loss_clip": 0.01476706, + "auxiliary_loss_mlp": 0.01270105, + "balance_loss_clip": 1.15718675, + "balance_loss_mlp": 1.02901602, + "epoch": 0.3814820381782654, + "flos": 24100058812320.0, + "grad_norm": 2.393487911308821, + "language_loss": 0.73214877, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75961685, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 5.856181621551514 + }, + { + "auxiliary_loss_clip": 0.01471419, + "auxiliary_loss_mlp": 0.01268212, + "balance_loss_clip": 1.15245771, + "balance_loss_mlp": 1.03055644, + "epoch": 0.3815421614309334, + "flos": 15780320066880.0, + "grad_norm": 1.9272993550006232, + "language_loss": 0.69946468, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.726861, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 2.7297608852386475 + }, + { + "auxiliary_loss_clip": 0.01473962, + "auxiliary_loss_mlp": 0.01270353, + "balance_loss_clip": 1.15456533, + "balance_loss_mlp": 1.0313623, + "epoch": 0.38160228468360136, + "flos": 19901710956960.0, + "grad_norm": 3.9188140603293236, + "language_loss": 0.75884551, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78628862, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.774217128753662 + }, + { + "auxiliary_loss_clip": 0.0146676, + "auxiliary_loss_mlp": 0.01265516, + "balance_loss_clip": 1.14851499, + "balance_loss_mlp": 1.02824175, + "epoch": 0.3816624079362694, + "flos": 19283058545760.0, + "grad_norm": 1.5067413898031434, + "language_loss": 0.74383122, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.77115399, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.773770809173584 + }, + { + "auxiliary_loss_clip": 0.01466135, + "auxiliary_loss_mlp": 0.01271053, + "balance_loss_clip": 1.14687884, + "balance_loss_mlp": 1.03244328, + "epoch": 0.38172253118893734, + "flos": 21179637203040.0, + "grad_norm": 1.9141071713583602, + "language_loss": 0.87600547, + "learning_rate": 2.836687208908142e-06, + "loss": 0.90337735, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.72806978225708 + }, + { + "auxiliary_loss_clip": 0.01473187, + "auxiliary_loss_mlp": 0.01266957, + "balance_loss_clip": 1.1537503, + "balance_loss_mlp": 1.02815747, + "epoch": 0.3817826544416053, + "flos": 17531101419840.0, + "grad_norm": 1.800823204711284, + "language_loss": 0.7705127, + "learning_rate": 2.836333449345341e-06, + "loss": 0.79791421, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.7756052017211914 + }, + { + "auxiliary_loss_clip": 0.01462696, + "auxiliary_loss_mlp": 0.01275094, + "balance_loss_clip": 1.14375091, + "balance_loss_mlp": 1.03591263, + "epoch": 0.38184277769427327, + "flos": 16328425371840.0, + "grad_norm": 2.1885388308243816, + "language_loss": 0.76158428, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78896213, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 4.312021970748901 + }, + { + "auxiliary_loss_clip": 0.01472228, + "auxiliary_loss_mlp": 0.01271175, + "balance_loss_clip": 1.15339756, + "balance_loss_mlp": 1.03161168, + "epoch": 0.38190290094694124, + "flos": 30445958509920.0, + "grad_norm": 1.8093243287864236, + "language_loss": 0.74257618, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.77001023, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.8431100845336914 + }, + { + "auxiliary_loss_clip": 0.01461527, + "auxiliary_loss_mlp": 0.01269961, + "balance_loss_clip": 1.14292264, + "balance_loss_mlp": 1.03535688, + "epoch": 0.3819630241996092, + "flos": 14211671559840.0, + "grad_norm": 2.122959852326034, + "language_loss": 0.64594209, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.67325699, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 2.805330276489258 + }, + { + "auxiliary_loss_clip": 0.0146088, + "auxiliary_loss_mlp": 0.01268767, + "balance_loss_clip": 1.14214039, + "balance_loss_mlp": 1.03206515, + "epoch": 0.38202314745227717, + "flos": 25012050527520.0, + "grad_norm": 1.605652772393759, + "language_loss": 0.83155233, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85884881, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.7863900661468506 + }, + { + "auxiliary_loss_clip": 0.01469567, + "auxiliary_loss_mlp": 0.01269939, + "balance_loss_clip": 1.15066481, + "balance_loss_mlp": 1.03476262, + "epoch": 0.38208327070494513, + "flos": 20816661068640.0, + "grad_norm": 1.8817611209696277, + "language_loss": 0.80600059, + "learning_rate": 2.834564176091943e-06, + "loss": 0.8333956, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.799612283706665 + }, + { + "auxiliary_loss_clip": 0.01469774, + "auxiliary_loss_mlp": 0.01283108, + "balance_loss_clip": 1.15082955, + "balance_loss_mlp": 1.04545259, + "epoch": 0.3821433939576131, + "flos": 22639923864000.0, + "grad_norm": 1.8089174862669952, + "language_loss": 0.75684988, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.78437865, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.813093423843384 + }, + { + "auxiliary_loss_clip": 0.0147198, + "auxiliary_loss_mlp": 0.01274396, + "balance_loss_clip": 1.15292525, + "balance_loss_mlp": 1.03635883, + "epoch": 0.38220351721028106, + "flos": 26872293643200.0, + "grad_norm": 1.854840701612602, + "language_loss": 0.8115555, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83901924, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 2.8046412467956543 + }, + { + "auxiliary_loss_clip": 0.01472289, + "auxiliary_loss_mlp": 0.01279152, + "balance_loss_clip": 1.15344, + "balance_loss_mlp": 1.03787267, + "epoch": 0.38226364046294903, + "flos": 23369706876960.0, + "grad_norm": 1.7252468142264776, + "language_loss": 0.77747649, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80499089, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 2.7417640686035156 + }, + { + "auxiliary_loss_clip": 0.01464554, + "auxiliary_loss_mlp": 0.01277235, + "balance_loss_clip": 1.14508152, + "balance_loss_mlp": 1.03919768, + "epoch": 0.382323763715617, + "flos": 19648272441600.0, + "grad_norm": 2.4386823113713585, + "language_loss": 0.78826153, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.81567943, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 2.7169530391693115 + }, + { + "auxiliary_loss_clip": 0.01469917, + "auxiliary_loss_mlp": 0.0127358, + "balance_loss_clip": 1.15203774, + "balance_loss_mlp": 1.03706861, + "epoch": 0.38238388696828496, + "flos": 54128689770240.0, + "grad_norm": 1.9266931073019231, + "language_loss": 0.69586229, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.72329724, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 3.0354092121124268 + }, + { + "auxiliary_loss_clip": 0.01470238, + "auxiliary_loss_mlp": 0.01271838, + "balance_loss_clip": 1.15151024, + "balance_loss_mlp": 1.03704333, + "epoch": 0.382444010220953, + "flos": 24938507096640.0, + "grad_norm": 1.4765443547443604, + "language_loss": 0.79216087, + "learning_rate": 2.83244000399261e-06, + "loss": 0.81958151, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 2.7668628692626953 + }, + { + "auxiliary_loss_clip": 0.01457977, + "auxiliary_loss_mlp": 0.01264053, + "balance_loss_clip": 1.13937855, + "balance_loss_mlp": 1.02811432, + "epoch": 0.38250413347362094, + "flos": 42340499966880.0, + "grad_norm": 1.3672376949085743, + "language_loss": 0.65440214, + "learning_rate": 2.832085864749337e-06, + "loss": 0.68162239, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 2.9319686889648438 + }, + { + "auxiliary_loss_clip": 0.01460217, + "auxiliary_loss_mlp": 0.01276607, + "balance_loss_clip": 1.14197683, + "balance_loss_mlp": 1.03761673, + "epoch": 0.3825642567262889, + "flos": 16291065769920.0, + "grad_norm": 1.6701586935429624, + "language_loss": 0.81722522, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84459347, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 2.725879192352295 + }, + { + "auxiliary_loss_clip": 0.01475805, + "auxiliary_loss_mlp": 0.01278925, + "balance_loss_clip": 1.15730715, + "balance_loss_mlp": 1.04374897, + "epoch": 0.3826243799789569, + "flos": 45657957562560.0, + "grad_norm": 1.6557363793232414, + "language_loss": 0.59098202, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.61852932, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 2.985969066619873 + }, + { + "auxiliary_loss_clip": 0.01470311, + "auxiliary_loss_mlp": 0.01278341, + "balance_loss_clip": 1.15187502, + "balance_loss_mlp": 1.04202008, + "epoch": 0.38268450323162484, + "flos": 25303948561440.0, + "grad_norm": 1.8820235719123042, + "language_loss": 0.68778002, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71526647, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.7886006832122803 + }, + { + "auxiliary_loss_clip": 0.01464826, + "auxiliary_loss_mlp": 0.01278214, + "balance_loss_clip": 1.1452508, + "balance_loss_mlp": 1.04303825, + "epoch": 0.3827446264842928, + "flos": 21838228331040.0, + "grad_norm": 2.0947231647460725, + "language_loss": 0.73135591, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75878632, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 2.78009033203125 + }, + { + "auxiliary_loss_clip": 0.01465097, + "auxiliary_loss_mlp": 0.01281629, + "balance_loss_clip": 1.14638269, + "balance_loss_mlp": 1.0441643, + "epoch": 0.38280474973696077, + "flos": 25736713238880.0, + "grad_norm": 2.4673876076931918, + "language_loss": 0.68548763, + "learning_rate": 2.830314695509902e-06, + "loss": 0.712955, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.764122724533081 + }, + { + "auxiliary_loss_clip": 0.01460452, + "auxiliary_loss_mlp": 0.01274191, + "balance_loss_clip": 1.14047623, + "balance_loss_mlp": 1.03806162, + "epoch": 0.38286487298962874, + "flos": 24898037385600.0, + "grad_norm": 1.8771351604350022, + "language_loss": 0.64172804, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66907448, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 2.7716946601867676 + }, + { + "auxiliary_loss_clip": 0.01465382, + "auxiliary_loss_mlp": 0.01277116, + "balance_loss_clip": 1.14462328, + "balance_loss_mlp": 1.04136777, + "epoch": 0.3829249962422967, + "flos": 28545966318240.0, + "grad_norm": 1.6556445079311959, + "language_loss": 0.678716, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.706141, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 2.809635877609253 + }, + { + "auxiliary_loss_clip": 0.0145943, + "auxiliary_loss_mlp": 0.01271003, + "balance_loss_clip": 1.13948238, + "balance_loss_mlp": 1.03506434, + "epoch": 0.38298511949496467, + "flos": 21473393716800.0, + "grad_norm": 1.8099758345807089, + "language_loss": 0.78348637, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.81079066, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.7601146697998047 + }, + { + "auxiliary_loss_clip": 0.01457632, + "auxiliary_loss_mlp": 0.01262581, + "balance_loss_clip": 1.13771152, + "balance_loss_mlp": 1.02740526, + "epoch": 0.38304524274763263, + "flos": 31682353056480.0, + "grad_norm": 2.6834517649968816, + "language_loss": 0.64563882, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.67284095, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 2.816133975982666 + }, + { + "auxiliary_loss_clip": 0.01459469, + "auxiliary_loss_mlp": 0.01267834, + "balance_loss_clip": 1.13798046, + "balance_loss_mlp": 1.02655411, + "epoch": 0.3831053660003006, + "flos": 25078842745920.0, + "grad_norm": 2.039638264013565, + "language_loss": 0.72660804, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.7538811, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 4.396399259567261 + }, + { + "auxiliary_loss_clip": 0.01458029, + "auxiliary_loss_mlp": 0.01269812, + "balance_loss_clip": 1.13654065, + "balance_loss_mlp": 1.03196597, + "epoch": 0.38316548925296856, + "flos": 23261307102720.0, + "grad_norm": 6.223270894168111, + "language_loss": 0.84676552, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87404394, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 2.8384335041046143 + }, + { + "auxiliary_loss_clip": 0.01465959, + "auxiliary_loss_mlp": 0.01277792, + "balance_loss_clip": 1.14440012, + "balance_loss_mlp": 1.04013586, + "epoch": 0.3832256125056366, + "flos": 34426368972000.0, + "grad_norm": 2.232891058701407, + "language_loss": 0.75108874, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.77852619, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.873041868209839 + }, + { + "auxiliary_loss_clip": 0.01455021, + "auxiliary_loss_mlp": 0.01276674, + "balance_loss_clip": 1.13362002, + "balance_loss_mlp": 1.03787351, + "epoch": 0.38328573575830455, + "flos": 21764950397280.0, + "grad_norm": 2.386888220248972, + "language_loss": 0.76342571, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.79074264, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 2.7469558715820312 + }, + { + "auxiliary_loss_clip": 0.01456434, + "auxiliary_loss_mlp": 0.01272345, + "balance_loss_clip": 1.13443029, + "balance_loss_mlp": 1.03526163, + "epoch": 0.3833458590109725, + "flos": 17381700940320.0, + "grad_norm": 3.4311718178836546, + "language_loss": 0.7295264, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.75681424, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 2.797088861465454 + }, + { + "auxiliary_loss_clip": 0.01454223, + "auxiliary_loss_mlp": 0.01264838, + "balance_loss_clip": 1.13283968, + "balance_loss_mlp": 1.03099704, + "epoch": 0.3834059822636405, + "flos": 29427160003200.0, + "grad_norm": 1.799854266468265, + "language_loss": 0.68296444, + "learning_rate": 2.826769997289796e-06, + "loss": 0.71015507, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.8078298568725586 + }, + { + "auxiliary_loss_clip": 0.01459923, + "auxiliary_loss_mlp": 0.01264533, + "balance_loss_clip": 1.13861609, + "balance_loss_mlp": 1.02573323, + "epoch": 0.38346610551630844, + "flos": 21472900650720.0, + "grad_norm": 1.7970463884092274, + "language_loss": 0.73369765, + "learning_rate": 2.826415354814344e-06, + "loss": 0.76094228, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.7860748767852783 + }, + { + "auxiliary_loss_clip": 0.01456299, + "auxiliary_loss_mlp": 0.01271512, + "balance_loss_clip": 1.13506174, + "balance_loss_mlp": 1.03538251, + "epoch": 0.3835262287689764, + "flos": 27563579209440.0, + "grad_norm": 1.7017745087487843, + "language_loss": 0.69424516, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.72152328, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.855868339538574 + }, + { + "auxiliary_loss_clip": 0.01462371, + "auxiliary_loss_mlp": 0.01276491, + "balance_loss_clip": 1.14142966, + "balance_loss_mlp": 1.03940797, + "epoch": 0.3835863520216444, + "flos": 15525554065920.0, + "grad_norm": 2.3030885512734396, + "language_loss": 0.83657128, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.86395991, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.7736003398895264 + }, + { + "auxiliary_loss_clip": 0.01459684, + "auxiliary_loss_mlp": 0.01263261, + "balance_loss_clip": 1.13866186, + "balance_loss_mlp": 1.02388918, + "epoch": 0.38364647527431234, + "flos": 21906916957440.0, + "grad_norm": 1.4957259337313717, + "language_loss": 0.81311899, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.84034842, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.807663679122925 + }, + { + "auxiliary_loss_clip": 0.01674612, + "auxiliary_loss_mlp": 0.01230133, + "balance_loss_clip": 1.36459422, + "balance_loss_mlp": 1.01879883, + "epoch": 0.3837065985269803, + "flos": 65541007500480.0, + "grad_norm": 0.7880394228662897, + "language_loss": 0.60325396, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.63230133, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.3063595294952393 + }, + { + "auxiliary_loss_clip": 0.01457091, + "auxiliary_loss_mlp": 0.01271412, + "balance_loss_clip": 1.1353997, + "balance_loss_mlp": 1.03394747, + "epoch": 0.38376672177964827, + "flos": 28259112729600.0, + "grad_norm": 2.3646057405382623, + "language_loss": 0.66830719, + "learning_rate": 2.824641672639794e-06, + "loss": 0.69559222, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 5.6970601081848145 + }, + { + "auxiliary_loss_clip": 0.01464008, + "auxiliary_loss_mlp": 0.01274596, + "balance_loss_clip": 1.14365757, + "balance_loss_mlp": 1.03808451, + "epoch": 0.38382684503231623, + "flos": 20633466234240.0, + "grad_norm": 1.837420164654519, + "language_loss": 0.74948287, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77686894, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.7717015743255615 + }, + { + "auxiliary_loss_clip": 0.01464476, + "auxiliary_loss_mlp": 0.01271492, + "balance_loss_clip": 1.14377141, + "balance_loss_mlp": 1.03555262, + "epoch": 0.3838869682849842, + "flos": 19607651017920.0, + "grad_norm": 1.3266275891702182, + "language_loss": 0.76324111, + "learning_rate": 2.823931980782341e-06, + "loss": 0.79060078, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.772510528564453 + }, + { + "auxiliary_loss_clip": 0.01673847, + "auxiliary_loss_mlp": 0.0123304, + "balance_loss_clip": 1.36547852, + "balance_loss_mlp": 1.02094269, + "epoch": 0.38394709153765216, + "flos": 56561501854080.0, + "grad_norm": 0.9347259032239558, + "language_loss": 0.66987479, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69894373, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 3.146552324295044 + }, + { + "auxiliary_loss_clip": 0.01454269, + "auxiliary_loss_mlp": 0.01272151, + "balance_loss_clip": 1.13406134, + "balance_loss_mlp": 1.03773808, + "epoch": 0.3840072147903202, + "flos": 15890616249120.0, + "grad_norm": 3.139305439414043, + "language_loss": 0.72945654, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.75672078, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.7894694805145264 + }, + { + "auxiliary_loss_clip": 0.0145843, + "auxiliary_loss_mlp": 0.01265974, + "balance_loss_clip": 1.13829708, + "balance_loss_mlp": 1.03327799, + "epoch": 0.38406733804298815, + "flos": 28220425642080.0, + "grad_norm": 1.7837633190611684, + "language_loss": 0.81211615, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83936018, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 4.456263303756714 + }, + { + "auxiliary_loss_clip": 0.01453865, + "auxiliary_loss_mlp": 0.01271677, + "balance_loss_clip": 1.13287866, + "balance_loss_mlp": 1.03878939, + "epoch": 0.3841274612956561, + "flos": 18225572951520.0, + "grad_norm": 2.0775767735467605, + "language_loss": 0.75913334, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78638875, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 2.7597813606262207 + }, + { + "auxiliary_loss_clip": 0.01457603, + "auxiliary_loss_mlp": 0.01271727, + "balance_loss_clip": 1.1365912, + "balance_loss_mlp": 1.03121042, + "epoch": 0.3841875845483241, + "flos": 19794980021760.0, + "grad_norm": 1.5918334396040918, + "language_loss": 0.7635929, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.79088622, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 2.774465799331665 + }, + { + "auxiliary_loss_clip": 0.0145534, + "auxiliary_loss_mlp": 0.01276188, + "balance_loss_clip": 1.13439465, + "balance_loss_mlp": 1.03815043, + "epoch": 0.38424770780099204, + "flos": 29901456380160.0, + "grad_norm": 1.6563221337458476, + "language_loss": 0.70147491, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72879016, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.842580556869507 + }, + { + "auxiliary_loss_clip": 0.01455041, + "auxiliary_loss_mlp": 0.01274773, + "balance_loss_clip": 1.13401818, + "balance_loss_mlp": 1.03768921, + "epoch": 0.38430783105366, + "flos": 20815826649120.0, + "grad_norm": 1.7170275028347657, + "language_loss": 0.8414821, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86878026, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 2.7501893043518066 + }, + { + "auxiliary_loss_clip": 0.01459169, + "auxiliary_loss_mlp": 0.01270297, + "balance_loss_clip": 1.1391325, + "balance_loss_mlp": 1.03340459, + "epoch": 0.384367954306328, + "flos": 11000110479840.0, + "grad_norm": 2.110572978401139, + "language_loss": 0.61123431, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63852894, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.7699170112609863 + }, + { + "auxiliary_loss_clip": 0.01461103, + "auxiliary_loss_mlp": 0.01280588, + "balance_loss_clip": 1.13996983, + "balance_loss_mlp": 1.04274178, + "epoch": 0.38442807755899594, + "flos": 25340397887520.0, + "grad_norm": 1.797199640354398, + "language_loss": 0.71435386, + "learning_rate": 2.820736822421029e-06, + "loss": 0.74177074, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.7852985858917236 + }, + { + "auxiliary_loss_clip": 0.01459629, + "auxiliary_loss_mlp": 0.01269781, + "balance_loss_clip": 1.13918805, + "balance_loss_mlp": 1.03059959, + "epoch": 0.3844882008116639, + "flos": 21071920135680.0, + "grad_norm": 2.322923672557143, + "language_loss": 0.80837429, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83566833, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 2.8107011318206787 + }, + { + "auxiliary_loss_clip": 0.01463528, + "auxiliary_loss_mlp": 0.01275771, + "balance_loss_clip": 1.14280462, + "balance_loss_mlp": 1.03811526, + "epoch": 0.38454832406433187, + "flos": 17964928085760.0, + "grad_norm": 2.005035634269014, + "language_loss": 0.71034777, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73774081, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.730489730834961 + }, + { + "auxiliary_loss_clip": 0.01652957, + "auxiliary_loss_mlp": 0.01217056, + "balance_loss_clip": 1.34457064, + "balance_loss_mlp": 1.00419617, + "epoch": 0.38460844731699984, + "flos": 67932249956640.0, + "grad_norm": 0.8859874416578575, + "language_loss": 0.59684467, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.62554479, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 3.4206130504608154 + }, + { + "auxiliary_loss_clip": 0.01461142, + "auxiliary_loss_mlp": 0.01265241, + "balance_loss_clip": 1.14098477, + "balance_loss_mlp": 1.02644122, + "epoch": 0.3846685705696678, + "flos": 25851484944000.0, + "grad_norm": 1.884478111444461, + "language_loss": 0.85310054, + "learning_rate": 2.819315942271794e-06, + "loss": 0.88036436, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 2.814610242843628 + }, + { + "auxiliary_loss_clip": 0.01452635, + "auxiliary_loss_mlp": 0.01271895, + "balance_loss_clip": 1.13240147, + "balance_loss_mlp": 1.03538394, + "epoch": 0.38472869382233577, + "flos": 16292089830240.0, + "grad_norm": 1.9882767757104285, + "language_loss": 0.80158263, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.82882792, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 2.7620062828063965 + }, + { + "auxiliary_loss_clip": 0.01456525, + "auxiliary_loss_mlp": 0.01266782, + "balance_loss_clip": 1.13558865, + "balance_loss_mlp": 1.02817225, + "epoch": 0.38478881707500373, + "flos": 19355084850240.0, + "grad_norm": 2.4050684458574336, + "language_loss": 0.66969991, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69693297, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.8194265365600586 + }, + { + "auxiliary_loss_clip": 0.01464303, + "auxiliary_loss_mlp": 0.01284259, + "balance_loss_clip": 1.14346218, + "balance_loss_mlp": 1.04545867, + "epoch": 0.38484894032767175, + "flos": 24863029329600.0, + "grad_norm": 1.6253921562661025, + "language_loss": 0.73197913, + "learning_rate": 2.81824995589303e-06, + "loss": 0.7594648, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 2.7983171939849854 + }, + { + "auxiliary_loss_clip": 0.01455912, + "auxiliary_loss_mlp": 0.01276233, + "balance_loss_clip": 1.13519979, + "balance_loss_mlp": 1.04239237, + "epoch": 0.3849090635803397, + "flos": 14503569593760.0, + "grad_norm": 2.1322067094141164, + "language_loss": 0.71913171, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74645311, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 2.747450828552246 + }, + { + "auxiliary_loss_clip": 0.0145477, + "auxiliary_loss_mlp": 0.01275016, + "balance_loss_clip": 1.1340065, + "balance_loss_mlp": 1.04041255, + "epoch": 0.3849691868330077, + "flos": 18517888195200.0, + "grad_norm": 2.0019199842353133, + "language_loss": 0.83432615, + "learning_rate": 2.817539143144128e-06, + "loss": 0.861624, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 2.7039103507995605 + }, + { + "auxiliary_loss_clip": 0.01465601, + "auxiliary_loss_mlp": 0.01278268, + "balance_loss_clip": 1.14471304, + "balance_loss_mlp": 1.04118466, + "epoch": 0.38502931008567565, + "flos": 21618546242400.0, + "grad_norm": 2.3566438763472717, + "language_loss": 0.8266052, + "learning_rate": 2.817183690261189e-06, + "loss": 0.85404396, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.7879343032836914 + }, + { + "auxiliary_loss_clip": 0.01456223, + "auxiliary_loss_mlp": 0.01261927, + "balance_loss_clip": 1.13356173, + "balance_loss_mlp": 1.02503443, + "epoch": 0.3850894333383436, + "flos": 25417961703360.0, + "grad_norm": 1.782653176042924, + "language_loss": 0.69677114, + "learning_rate": 2.816828206390563e-06, + "loss": 0.72395265, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 2.832275152206421 + }, + { + "auxiliary_loss_clip": 0.01463704, + "auxiliary_loss_mlp": 0.0126498, + "balance_loss_clip": 1.14247847, + "balance_loss_mlp": 1.02961278, + "epoch": 0.3851495565910116, + "flos": 20229830748000.0, + "grad_norm": 1.9875601236281009, + "language_loss": 0.7912491, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81853592, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.798673391342163 + }, + { + "auxiliary_loss_clip": 0.01462021, + "auxiliary_loss_mlp": 0.01274258, + "balance_loss_clip": 1.14089274, + "balance_loss_mlp": 1.03622091, + "epoch": 0.38520967984367954, + "flos": 16510330648800.0, + "grad_norm": 2.7732165106980813, + "language_loss": 0.84323061, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.87059343, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 2.7363431453704834 + }, + { + "auxiliary_loss_clip": 0.01672151, + "auxiliary_loss_mlp": 0.01231071, + "balance_loss_clip": 1.36453128, + "balance_loss_mlp": 1.01973724, + "epoch": 0.3852698030963475, + "flos": 61320243738240.0, + "grad_norm": 0.8450755052021233, + "language_loss": 0.64964634, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67867857, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.39662504196167 + }, + { + "auxiliary_loss_clip": 0.01450364, + "auxiliary_loss_mlp": 0.01270736, + "balance_loss_clip": 1.1285367, + "balance_loss_mlp": 1.03098273, + "epoch": 0.3853299263490155, + "flos": 22895220859200.0, + "grad_norm": 1.4430719148028845, + "language_loss": 0.73640645, + "learning_rate": 2.8154059613008e-06, + "loss": 0.76361746, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 2.8057775497436523 + }, + { + "auxiliary_loss_clip": 0.01455336, + "auxiliary_loss_mlp": 0.0127647, + "balance_loss_clip": 1.13273668, + "balance_loss_mlp": 1.03690684, + "epoch": 0.38539004960168344, + "flos": 20049556381920.0, + "grad_norm": 71.26850295639638, + "language_loss": 0.70460701, + "learning_rate": 2.81505032269396e-06, + "loss": 0.73192501, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 2.7952592372894287 + }, + { + "auxiliary_loss_clip": 0.01657331, + "auxiliary_loss_mlp": 0.01239281, + "balance_loss_clip": 1.35145247, + "balance_loss_mlp": 1.02870941, + "epoch": 0.3854501728543514, + "flos": 68738155515360.0, + "grad_norm": 0.7063370566794089, + "language_loss": 0.60234451, + "learning_rate": 2.81469465318033e-06, + "loss": 0.6313107, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 4.981153249740601 + }, + { + "auxiliary_loss_clip": 0.01452605, + "auxiliary_loss_mlp": 0.01267988, + "balance_loss_clip": 1.13038301, + "balance_loss_mlp": 1.0314765, + "epoch": 0.38551029610701937, + "flos": 20487100007520.0, + "grad_norm": 2.5428851743513636, + "language_loss": 0.77513623, + "learning_rate": 2.814338952773397e-06, + "loss": 0.80234218, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.8062801361083984 + }, + { + "auxiliary_loss_clip": 0.0145463, + "auxiliary_loss_mlp": 0.0127358, + "balance_loss_clip": 1.13253951, + "balance_loss_mlp": 1.03382647, + "epoch": 0.38557041935968733, + "flos": 23473744912800.0, + "grad_norm": 1.912453898695602, + "language_loss": 0.77656698, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.8038491, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 2.754146099090576 + }, + { + "auxiliary_loss_clip": 0.01647615, + "auxiliary_loss_mlp": 0.01227882, + "balance_loss_clip": 1.34277821, + "balance_loss_mlp": 1.01959991, + "epoch": 0.38563054261235535, + "flos": 63972662418720.0, + "grad_norm": 0.8111682809741602, + "language_loss": 0.61241651, + "learning_rate": 2.813627459333576e-06, + "loss": 0.64117146, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 3.120846748352051 + }, + { + "auxiliary_loss_clip": 0.01459896, + "auxiliary_loss_mlp": 0.01272258, + "balance_loss_clip": 1.13792872, + "balance_loss_mlp": 1.03383946, + "epoch": 0.3856906658650233, + "flos": 23990066055360.0, + "grad_norm": 2.9321379996121264, + "language_loss": 0.77904159, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.80636311, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 2.77166748046875 + }, + { + "auxiliary_loss_clip": 0.01460387, + "auxiliary_loss_mlp": 0.01260025, + "balance_loss_clip": 1.13914895, + "balance_loss_mlp": 1.02484894, + "epoch": 0.3857507891176913, + "flos": 25009661053440.0, + "grad_norm": 1.717564525997772, + "language_loss": 0.79967183, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.82687593, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.8255465030670166 + }, + { + "auxiliary_loss_clip": 0.01458128, + "auxiliary_loss_mlp": 0.01269593, + "balance_loss_clip": 1.13777506, + "balance_loss_mlp": 1.03403556, + "epoch": 0.38581091237035925, + "flos": 21538555024320.0, + "grad_norm": 1.8087420723981686, + "language_loss": 0.79456049, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.82183766, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.8443264961242676 + }, + { + "auxiliary_loss_clip": 0.01451656, + "auxiliary_loss_mlp": 0.01263692, + "balance_loss_clip": 1.13212729, + "balance_loss_mlp": 1.02737164, + "epoch": 0.3858710356230272, + "flos": 17385797181600.0, + "grad_norm": 1.8873772455252509, + "language_loss": 0.80592304, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.83307654, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.759660005569458 + }, + { + "auxiliary_loss_clip": 0.01455657, + "auxiliary_loss_mlp": 0.01263525, + "balance_loss_clip": 1.13569665, + "balance_loss_mlp": 1.02911186, + "epoch": 0.3859311588756952, + "flos": 20341681984800.0, + "grad_norm": 1.9894135161368742, + "language_loss": 0.79469121, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.82188302, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.7878055572509766 + }, + { + "auxiliary_loss_clip": 0.01457337, + "auxiliary_loss_mlp": 0.01265648, + "balance_loss_clip": 1.13772762, + "balance_loss_mlp": 1.02780187, + "epoch": 0.38599128212836314, + "flos": 26323467703200.0, + "grad_norm": 2.3865103194203217, + "language_loss": 0.67696071, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.70419055, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 4.300091981887817 + }, + { + "auxiliary_loss_clip": 0.01466388, + "auxiliary_loss_mlp": 0.01264937, + "balance_loss_clip": 1.14744639, + "balance_loss_mlp": 1.0331943, + "epoch": 0.3860514053810311, + "flos": 13555583690400.0, + "grad_norm": 1.9471401432121054, + "language_loss": 0.81327999, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.84059322, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 4.322140216827393 + }, + { + "auxiliary_loss_clip": 0.01463233, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 1.14375091, + "balance_loss_mlp": 1.03315473, + "epoch": 0.3861115286336991, + "flos": 20956086442080.0, + "grad_norm": 1.9729355284803327, + "language_loss": 0.72419155, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.75153589, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.7902469635009766 + }, + { + "auxiliary_loss_clip": 0.014672, + "auxiliary_loss_mlp": 0.01274501, + "balance_loss_clip": 1.14900208, + "balance_loss_mlp": 1.04523778, + "epoch": 0.38617165188636704, + "flos": 16364571272640.0, + "grad_norm": 2.022246757157443, + "language_loss": 0.66729426, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.69471127, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.7774412631988525 + }, + { + "auxiliary_loss_clip": 0.01469399, + "auxiliary_loss_mlp": 0.01278631, + "balance_loss_clip": 1.15015185, + "balance_loss_mlp": 1.04097557, + "epoch": 0.386231775139035, + "flos": 34790141597760.0, + "grad_norm": 2.0727458937699854, + "language_loss": 0.69200355, + "learning_rate": 2.810068143123449e-06, + "loss": 0.71948385, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.9161436557769775 + }, + { + "auxiliary_loss_clip": 0.01467343, + "auxiliary_loss_mlp": 0.01265689, + "balance_loss_clip": 1.14895916, + "balance_loss_mlp": 1.03146625, + "epoch": 0.38629189839170297, + "flos": 21728349358560.0, + "grad_norm": 1.3614324267221796, + "language_loss": 0.72300303, + "learning_rate": 2.809712042331429e-06, + "loss": 0.75033331, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 2.8081843852996826 + }, + { + "auxiliary_loss_clip": 0.01465213, + "auxiliary_loss_mlp": 0.01274279, + "balance_loss_clip": 1.14571822, + "balance_loss_mlp": 1.03586006, + "epoch": 0.38635202164437094, + "flos": 27925872636960.0, + "grad_norm": 2.9198218407067276, + "language_loss": 0.80231577, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82971066, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 4.389045715332031 + }, + { + "auxiliary_loss_clip": 0.01469333, + "auxiliary_loss_mlp": 0.0126204, + "balance_loss_clip": 1.15032053, + "balance_loss_mlp": 1.02533793, + "epoch": 0.38641214489703896, + "flos": 23588895899520.0, + "grad_norm": 2.103800907316383, + "language_loss": 0.74749231, + "learning_rate": 2.80899974864781e-06, + "loss": 0.77480608, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 2.8391330242156982 + }, + { + "auxiliary_loss_clip": 0.01472156, + "auxiliary_loss_mlp": 0.01294449, + "balance_loss_clip": 1.15265596, + "balance_loss_mlp": 1.06117976, + "epoch": 0.3864722681497069, + "flos": 12642909268320.0, + "grad_norm": 2.0573111064927603, + "language_loss": 0.70275855, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.73042464, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.7769033908843994 + }, + { + "auxiliary_loss_clip": 0.01467969, + "auxiliary_loss_mlp": 0.01284571, + "balance_loss_clip": 1.14801908, + "balance_loss_mlp": 1.05015802, + "epoch": 0.3865323914023749, + "flos": 17600624465760.0, + "grad_norm": 2.20890698271356, + "language_loss": 0.84528995, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.87281537, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 2.738325834274292 + }, + { + "auxiliary_loss_clip": 0.01478964, + "auxiliary_loss_mlp": 0.01274765, + "balance_loss_clip": 1.15769339, + "balance_loss_mlp": 1.03882551, + "epoch": 0.38659251465504285, + "flos": 18480945803040.0, + "grad_norm": 2.3474786820676865, + "language_loss": 0.81163406, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83917129, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.780819892883301 + }, + { + "auxiliary_loss_clip": 0.01645644, + "auxiliary_loss_mlp": 0.01211288, + "balance_loss_clip": 1.34512389, + "balance_loss_mlp": 0.99919128, + "epoch": 0.3866526379077108, + "flos": 64172697356160.0, + "grad_norm": 0.7164704602142239, + "language_loss": 0.58742756, + "learning_rate": 2.807574793260416e-06, + "loss": 0.6159969, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.363409996032715 + }, + { + "auxiliary_loss_clip": 0.01478557, + "auxiliary_loss_mlp": 0.01278084, + "balance_loss_clip": 1.15798163, + "balance_loss_mlp": 1.03813934, + "epoch": 0.3867127611603788, + "flos": 14389821948960.0, + "grad_norm": 1.8873902075694846, + "language_loss": 0.79304653, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.82061291, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 2.8643877506256104 + }, + { + "auxiliary_loss_clip": 0.01471554, + "auxiliary_loss_mlp": 0.01274406, + "balance_loss_clip": 1.15074539, + "balance_loss_mlp": 1.03331757, + "epoch": 0.38677288441304675, + "flos": 20012803630560.0, + "grad_norm": 2.210251881708686, + "language_loss": 0.80710137, + "learning_rate": 2.806862131772779e-06, + "loss": 0.83456099, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.7928435802459717 + }, + { + "auxiliary_loss_clip": 0.01486686, + "auxiliary_loss_mlp": 0.01274884, + "balance_loss_clip": 1.16500711, + "balance_loss_mlp": 1.03856313, + "epoch": 0.3868330076657147, + "flos": 22239095061600.0, + "grad_norm": 2.356242763425109, + "language_loss": 0.70857739, + "learning_rate": 2.806505755127765e-06, + "loss": 0.73619306, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.746436834335327 + }, + { + "auxiliary_loss_clip": 0.0147164, + "auxiliary_loss_mlp": 0.01282025, + "balance_loss_clip": 1.15110779, + "balance_loss_mlp": 1.04551339, + "epoch": 0.3868931309183827, + "flos": 16729330030560.0, + "grad_norm": 1.8711285505516038, + "language_loss": 0.77457941, + "learning_rate": 2.806149347899972e-06, + "loss": 0.80211604, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 2.787822723388672 + }, + { + "auxiliary_loss_clip": 0.01470709, + "auxiliary_loss_mlp": 0.01274928, + "balance_loss_clip": 1.15094984, + "balance_loss_mlp": 1.04185033, + "epoch": 0.38695325417105064, + "flos": 22676942112480.0, + "grad_norm": 1.7451004183326453, + "language_loss": 0.79830557, + "learning_rate": 2.805792910102915e-06, + "loss": 0.82576191, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.7530975341796875 + }, + { + "auxiliary_loss_clip": 0.01475127, + "auxiliary_loss_mlp": 0.01267757, + "balance_loss_clip": 1.1553303, + "balance_loss_mlp": 1.03296244, + "epoch": 0.3870133774237186, + "flos": 23114258169120.0, + "grad_norm": 1.8650372738459762, + "language_loss": 0.76706183, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79449069, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 2.803152084350586 + }, + { + "auxiliary_loss_clip": 0.01475596, + "auxiliary_loss_mlp": 0.01270425, + "balance_loss_clip": 1.15520978, + "balance_loss_mlp": 1.03486705, + "epoch": 0.3870735006763866, + "flos": 17677619359200.0, + "grad_norm": 2.0237584584289126, + "language_loss": 0.81832945, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84578967, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 2.8280959129333496 + }, + { + "auxiliary_loss_clip": 0.01476747, + "auxiliary_loss_mlp": 0.01277336, + "balance_loss_clip": 1.15679681, + "balance_loss_mlp": 1.04158771, + "epoch": 0.38713362392905454, + "flos": 23298097782240.0, + "grad_norm": 1.3957362862158875, + "language_loss": 0.75527692, + "learning_rate": 2.804723413431326e-06, + "loss": 0.78281772, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 2.790107011795044 + }, + { + "auxiliary_loss_clip": 0.01482037, + "auxiliary_loss_mlp": 0.01278996, + "balance_loss_clip": 1.16129899, + "balance_loss_mlp": 1.04649019, + "epoch": 0.38719374718172256, + "flos": 21033233048160.0, + "grad_norm": 1.465854195664495, + "language_loss": 0.73743516, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76504552, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 2.759737014770508 + }, + { + "auxiliary_loss_clip": 0.01480163, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 1.15944672, + "balance_loss_mlp": 1.03068209, + "epoch": 0.3872538704343905, + "flos": 19611823115520.0, + "grad_norm": 1.978432319418159, + "language_loss": 0.81535852, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84282255, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 2.8056576251983643 + }, + { + "auxiliary_loss_clip": 0.01479515, + "auxiliary_loss_mlp": 0.01260729, + "balance_loss_clip": 1.15985703, + "balance_loss_mlp": 1.02555323, + "epoch": 0.3873139936870585, + "flos": 17531973767520.0, + "grad_norm": 1.9836334231517903, + "language_loss": 0.81516904, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.84257144, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.8258373737335205 + }, + { + "auxiliary_loss_clip": 0.01474143, + "auxiliary_loss_mlp": 0.01275896, + "balance_loss_clip": 1.15259063, + "balance_loss_mlp": 1.03824008, + "epoch": 0.38737411693972645, + "flos": 17788560320160.0, + "grad_norm": 1.7826816976752036, + "language_loss": 0.84030831, + "learning_rate": 2.803296990719624e-06, + "loss": 0.8678087, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.7486648559570312 + }, + { + "auxiliary_loss_clip": 0.01620013, + "auxiliary_loss_mlp": 0.01242477, + "balance_loss_clip": 1.31885004, + "balance_loss_mlp": 1.03266907, + "epoch": 0.3874342401923944, + "flos": 58309969589280.0, + "grad_norm": 0.7714518855378419, + "language_loss": 0.50178874, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.53041363, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 3.335813283920288 + }, + { + "auxiliary_loss_clip": 0.01478357, + "auxiliary_loss_mlp": 0.01264675, + "balance_loss_clip": 1.15832543, + "balance_loss_mlp": 1.02854538, + "epoch": 0.3874943634450624, + "flos": 17713841116320.0, + "grad_norm": 1.8009301477726303, + "language_loss": 0.78612387, + "learning_rate": 2.802583596543065e-06, + "loss": 0.81355417, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.7603726387023926 + }, + { + "auxiliary_loss_clip": 0.01477963, + "auxiliary_loss_mlp": 0.01271425, + "balance_loss_clip": 1.15834928, + "balance_loss_mlp": 1.03529513, + "epoch": 0.38755448669773035, + "flos": 19246874716800.0, + "grad_norm": 2.1126369456528002, + "language_loss": 0.81077671, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83827055, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 2.822618246078491 + }, + { + "auxiliary_loss_clip": 0.01478698, + "auxiliary_loss_mlp": 0.01262153, + "balance_loss_clip": 1.15827727, + "balance_loss_mlp": 1.02316213, + "epoch": 0.3876146099503983, + "flos": 20596334201280.0, + "grad_norm": 1.6956387702134506, + "language_loss": 0.77481043, + "learning_rate": 2.801870080630306e-06, + "loss": 0.80221891, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.788416862487793 + }, + { + "auxiliary_loss_clip": 0.01478722, + "auxiliary_loss_mlp": 0.01264278, + "balance_loss_clip": 1.15949667, + "balance_loss_mlp": 1.02490592, + "epoch": 0.3876747332030663, + "flos": 19283134402080.0, + "grad_norm": 1.7006885928581708, + "language_loss": 0.76003146, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78746146, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 4.560269832611084 + }, + { + "auxiliary_loss_clip": 0.01479555, + "auxiliary_loss_mlp": 0.0127881, + "balance_loss_clip": 1.16082144, + "balance_loss_mlp": 1.04344332, + "epoch": 0.38773485645573424, + "flos": 18947504835360.0, + "grad_norm": 1.7788803111466633, + "language_loss": 0.76014733, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78773105, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 2.784003257751465 + }, + { + "auxiliary_loss_clip": 0.01466643, + "auxiliary_loss_mlp": 0.01276458, + "balance_loss_clip": 1.14576387, + "balance_loss_mlp": 1.03899348, + "epoch": 0.3877949797084022, + "flos": 23073333320160.0, + "grad_norm": 1.6680931740070875, + "language_loss": 0.78549707, + "learning_rate": 2.800799578742542e-06, + "loss": 0.81292808, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.879066228866577 + }, + { + "auxiliary_loss_clip": 0.01463061, + "auxiliary_loss_mlp": 0.01280021, + "balance_loss_clip": 1.14288557, + "balance_loss_mlp": 1.04103065, + "epoch": 0.3878551029610702, + "flos": 29098092008160.0, + "grad_norm": 3.211605297918839, + "language_loss": 0.78550255, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.81293339, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.8468945026397705 + }, + { + "auxiliary_loss_clip": 0.01470538, + "auxiliary_loss_mlp": 0.01264833, + "balance_loss_clip": 1.1510365, + "balance_loss_mlp": 1.031564, + "epoch": 0.38791522621373814, + "flos": 20998528417440.0, + "grad_norm": 2.037147840224029, + "language_loss": 0.7672376, + "learning_rate": 2.800085758962812e-06, + "loss": 0.79459131, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 2.7992026805877686 + }, + { + "auxiliary_loss_clip": 0.01469617, + "auxiliary_loss_mlp": 0.01267333, + "balance_loss_clip": 1.15004551, + "balance_loss_mlp": 1.03368306, + "epoch": 0.3879753494664061, + "flos": 15488649601920.0, + "grad_norm": 1.6009819902883156, + "language_loss": 0.80072594, + "learning_rate": 2.799728803557182e-06, + "loss": 0.8280955, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.7746047973632812 + }, + { + "auxiliary_loss_clip": 0.01474625, + "auxiliary_loss_mlp": 0.0127988, + "balance_loss_clip": 1.15509272, + "balance_loss_mlp": 1.04222453, + "epoch": 0.3880354727190741, + "flos": 22056127796160.0, + "grad_norm": 1.5845347540703907, + "language_loss": 0.71764028, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.74518538, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 2.7850873470306396 + }, + { + "auxiliary_loss_clip": 0.01478201, + "auxiliary_loss_mlp": 0.01270344, + "balance_loss_clip": 1.15838957, + "balance_loss_mlp": 1.03078079, + "epoch": 0.3880955959717421, + "flos": 20342440548000.0, + "grad_norm": 1.8080755986585706, + "language_loss": 0.7757476, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80323303, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.7705252170562744 + }, + { + "auxiliary_loss_clip": 0.01469453, + "auxiliary_loss_mlp": 0.01276006, + "balance_loss_clip": 1.14955592, + "balance_loss_mlp": 1.03930354, + "epoch": 0.38815571922441006, + "flos": 23077619202240.0, + "grad_norm": 1.5287348468617017, + "language_loss": 0.75777906, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78523362, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 2.7790417671203613 + }, + { + "auxiliary_loss_clip": 0.01477013, + "auxiliary_loss_mlp": 0.01263585, + "balance_loss_clip": 1.158095, + "balance_loss_mlp": 1.02516627, + "epoch": 0.388215842477078, + "flos": 20779035969600.0, + "grad_norm": 2.252932064538493, + "language_loss": 0.6032477, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.63065368, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 2.7067558765411377 + }, + { + "auxiliary_loss_clip": 0.01462255, + "auxiliary_loss_mlp": 0.01269036, + "balance_loss_clip": 1.14321017, + "balance_loss_mlp": 1.02737498, + "epoch": 0.388275965729746, + "flos": 20450233471680.0, + "grad_norm": 2.0575539840545725, + "language_loss": 0.80446678, + "learning_rate": 2.797943571912841e-06, + "loss": 0.83177972, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 4.242969751358032 + }, + { + "auxiliary_loss_clip": 0.0146801, + "auxiliary_loss_mlp": 0.01271074, + "balance_loss_clip": 1.14809418, + "balance_loss_mlp": 1.03284609, + "epoch": 0.38833608898241395, + "flos": 27894922894080.0, + "grad_norm": 1.8988382698748218, + "language_loss": 0.81980443, + "learning_rate": 2.797586434755509e-06, + "loss": 0.84719527, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 4.366363525390625 + }, + { + "auxiliary_loss_clip": 0.01465966, + "auxiliary_loss_mlp": 0.01274339, + "balance_loss_clip": 1.146101, + "balance_loss_mlp": 1.0391624, + "epoch": 0.3883962122350819, + "flos": 18078296448960.0, + "grad_norm": 2.3552755038925994, + "language_loss": 0.62070847, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.64811146, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.84065318107605 + }, + { + "auxiliary_loss_clip": 0.01470721, + "auxiliary_loss_mlp": 0.01267263, + "balance_loss_clip": 1.15195143, + "balance_loss_mlp": 1.03323102, + "epoch": 0.3884563354877499, + "flos": 23624472877920.0, + "grad_norm": 1.531381818704655, + "language_loss": 0.86058748, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88796735, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.7959983348846436 + }, + { + "auxiliary_loss_clip": 0.0146307, + "auxiliary_loss_mlp": 0.01271489, + "balance_loss_clip": 1.14295673, + "balance_loss_mlp": 1.03688502, + "epoch": 0.38851645874041785, + "flos": 27455975926560.0, + "grad_norm": 2.61756157903341, + "language_loss": 0.70950103, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73684663, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.8234260082244873 + }, + { + "auxiliary_loss_clip": 0.01465865, + "auxiliary_loss_mlp": 0.01274607, + "balance_loss_clip": 1.14696646, + "balance_loss_mlp": 1.0375241, + "epoch": 0.3885765819930858, + "flos": 25230405130560.0, + "grad_norm": 2.5150115059944573, + "language_loss": 0.76461554, + "learning_rate": 2.796157583816052e-06, + "loss": 0.7920202, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.7728309631347656 + }, + { + "auxiliary_loss_clip": 0.01470517, + "auxiliary_loss_mlp": 0.01280416, + "balance_loss_clip": 1.15012789, + "balance_loss_mlp": 1.04066277, + "epoch": 0.3886367052457538, + "flos": 16948519053120.0, + "grad_norm": 2.4000792909686175, + "language_loss": 0.70360529, + "learning_rate": 2.795800295571382e-06, + "loss": 0.73111457, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 4.277510404586792 + }, + { + "auxiliary_loss_clip": 0.01465796, + "auxiliary_loss_mlp": 0.01267141, + "balance_loss_clip": 1.14641261, + "balance_loss_mlp": 1.03291893, + "epoch": 0.38869682849842174, + "flos": 27156150907200.0, + "grad_norm": 2.0638836623406305, + "language_loss": 0.69323081, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.72056019, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 2.85610032081604 + }, + { + "auxiliary_loss_clip": 0.01469996, + "auxiliary_loss_mlp": 0.01266969, + "balance_loss_clip": 1.15088844, + "balance_loss_mlp": 1.02988553, + "epoch": 0.3887569517510897, + "flos": 21065434420320.0, + "grad_norm": 2.1574937723653, + "language_loss": 0.78059006, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80795968, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 2.746307611465454 + }, + { + "auxiliary_loss_clip": 0.01467324, + "auxiliary_loss_mlp": 0.01266129, + "balance_loss_clip": 1.14809358, + "balance_loss_mlp": 1.02580333, + "epoch": 0.38881707500375773, + "flos": 29499982799040.0, + "grad_norm": 1.7012247146299189, + "language_loss": 0.69318581, + "learning_rate": 2.794728249830611e-06, + "loss": 0.72052038, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 2.8185184001922607 + }, + { + "auxiliary_loss_clip": 0.0146254, + "auxiliary_loss_mlp": 0.01267537, + "balance_loss_clip": 1.14294648, + "balance_loss_mlp": 1.0291183, + "epoch": 0.3888771982564257, + "flos": 17489911073760.0, + "grad_norm": 2.2176160162893104, + "language_loss": 0.83592713, + "learning_rate": 2.794370840959936e-06, + "loss": 0.86322796, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 2.7378523349761963 + }, + { + "auxiliary_loss_clip": 0.01465393, + "auxiliary_loss_mlp": 0.01268489, + "balance_loss_clip": 1.14494002, + "balance_loss_mlp": 1.03197825, + "epoch": 0.38893732150909366, + "flos": 21944466200160.0, + "grad_norm": 1.7540961548322878, + "language_loss": 0.8454662, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.872805, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.719416379928589 + }, + { + "auxiliary_loss_clip": 0.01470698, + "auxiliary_loss_mlp": 0.01262888, + "balance_loss_clip": 1.15041661, + "balance_loss_mlp": 1.02465963, + "epoch": 0.3889974447617616, + "flos": 24278474698560.0, + "grad_norm": 1.7773730023676908, + "language_loss": 0.75014865, + "learning_rate": 2.793655932864273e-06, + "loss": 0.77748454, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 2.8184738159179688 + }, + { + "auxiliary_loss_clip": 0.01467233, + "auxiliary_loss_mlp": 0.012605, + "balance_loss_clip": 1.14812326, + "balance_loss_mlp": 1.02150917, + "epoch": 0.3890575680144296, + "flos": 25669541738880.0, + "grad_norm": 1.6652584539087694, + "language_loss": 0.74787372, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77515113, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 2.805393934249878 + }, + { + "auxiliary_loss_clip": 0.01467931, + "auxiliary_loss_mlp": 0.01261392, + "balance_loss_clip": 1.14834678, + "balance_loss_mlp": 1.02469027, + "epoch": 0.38911769126709755, + "flos": 22857406119360.0, + "grad_norm": 1.760108134855252, + "language_loss": 0.67891061, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70620382, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.810702323913574 + }, + { + "auxiliary_loss_clip": 0.01468206, + "auxiliary_loss_mlp": 0.012619, + "balance_loss_clip": 1.14893878, + "balance_loss_mlp": 1.02424467, + "epoch": 0.3891778145197655, + "flos": 25449897578400.0, + "grad_norm": 1.5983806753269894, + "language_loss": 0.76278007, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.79008108, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 2.8064754009246826 + }, + { + "auxiliary_loss_clip": 0.01470033, + "auxiliary_loss_mlp": 0.01262487, + "balance_loss_clip": 1.15068996, + "balance_loss_mlp": 1.02425957, + "epoch": 0.3892379377724335, + "flos": 14029728354720.0, + "grad_norm": 2.151507893974714, + "language_loss": 0.71420699, + "learning_rate": 2.792225755635257e-06, + "loss": 0.74153221, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 2.7611031532287598 + }, + { + "auxiliary_loss_clip": 0.01463615, + "auxiliary_loss_mlp": 0.01259539, + "balance_loss_clip": 1.14401436, + "balance_loss_mlp": 1.0222652, + "epoch": 0.38929806102510145, + "flos": 20159549138880.0, + "grad_norm": 1.8291945695191771, + "language_loss": 0.69095951, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71819103, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.7506442070007324 + }, + { + "auxiliary_loss_clip": 0.01475058, + "auxiliary_loss_mlp": 0.01275063, + "balance_loss_clip": 1.15570915, + "balance_loss_mlp": 1.03263903, + "epoch": 0.3893581842777694, + "flos": 22166158481280.0, + "grad_norm": 2.2552923378673726, + "language_loss": 0.755476, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78297728, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 3.059943914413452 + }, + { + "auxiliary_loss_clip": 0.0155228, + "auxiliary_loss_mlp": 0.01212234, + "balance_loss_clip": 1.25815654, + "balance_loss_mlp": 0.99861145, + "epoch": 0.3894183075304374, + "flos": 67308932381760.0, + "grad_norm": 0.791843301634659, + "language_loss": 0.58096182, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60860693, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 3.2753102779388428 + }, + { + "auxiliary_loss_clip": 0.01479753, + "auxiliary_loss_mlp": 0.01272473, + "balance_loss_clip": 1.15993977, + "balance_loss_mlp": 1.03214765, + "epoch": 0.38947843078310534, + "flos": 18549255147840.0, + "grad_norm": 3.002203286669761, + "language_loss": 0.77895898, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80648124, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 2.7304880619049072 + }, + { + "auxiliary_loss_clip": 0.0147217, + "auxiliary_loss_mlp": 0.01263714, + "balance_loss_clip": 1.15325999, + "balance_loss_mlp": 1.02777481, + "epoch": 0.3895385540357733, + "flos": 14607342132480.0, + "grad_norm": 2.0532912796064213, + "language_loss": 0.82464391, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.85200274, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 2.781888246536255 + }, + { + "auxiliary_loss_clip": 0.01477495, + "auxiliary_loss_mlp": 0.01272984, + "balance_loss_clip": 1.15910995, + "balance_loss_mlp": 1.03837967, + "epoch": 0.38959867728844133, + "flos": 19977302508480.0, + "grad_norm": 1.6094706180466212, + "language_loss": 0.80151391, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82901865, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 2.736119270324707 + }, + { + "auxiliary_loss_clip": 0.01472255, + "auxiliary_loss_mlp": 0.01261656, + "balance_loss_clip": 1.15473163, + "balance_loss_mlp": 1.02953196, + "epoch": 0.3896588005411093, + "flos": 22674211284960.0, + "grad_norm": 1.9286517038439943, + "language_loss": 0.83035827, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85769743, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.8378520011901855 + }, + { + "auxiliary_loss_clip": 0.01479598, + "auxiliary_loss_mlp": 0.01264548, + "balance_loss_clip": 1.16248226, + "balance_loss_mlp": 1.03089833, + "epoch": 0.38971892379377726, + "flos": 20998149135840.0, + "grad_norm": 1.6831005419375578, + "language_loss": 0.75573909, + "learning_rate": 2.789363960063863e-06, + "loss": 0.78318059, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.78511643409729 + }, + { + "auxiliary_loss_clip": 0.01469125, + "auxiliary_loss_mlp": 0.01269063, + "balance_loss_clip": 1.15086067, + "balance_loss_mlp": 1.03445935, + "epoch": 0.3897790470464452, + "flos": 22530689670240.0, + "grad_norm": 2.1909736409290606, + "language_loss": 0.78880769, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81618959, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 2.8070552349090576 + }, + { + "auxiliary_loss_clip": 0.01466244, + "auxiliary_loss_mlp": 0.01283144, + "balance_loss_clip": 1.14780748, + "balance_loss_mlp": 1.04892159, + "epoch": 0.3898391702991132, + "flos": 26212109532480.0, + "grad_norm": 1.516689615014371, + "language_loss": 0.80158305, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82907701, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 2.8364856243133545 + }, + { + "auxiliary_loss_clip": 0.01479577, + "auxiliary_loss_mlp": 0.01278864, + "balance_loss_clip": 1.16026568, + "balance_loss_mlp": 1.04445076, + "epoch": 0.38989929355178116, + "flos": 21067482540960.0, + "grad_norm": 1.5686469451347482, + "language_loss": 0.78324437, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.8108288, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 2.79498028755188 + }, + { + "auxiliary_loss_clip": 0.01466627, + "auxiliary_loss_mlp": 0.01276548, + "balance_loss_clip": 1.14644718, + "balance_loss_mlp": 1.04537737, + "epoch": 0.3899594168044491, + "flos": 25486498617120.0, + "grad_norm": 2.8431815074726465, + "language_loss": 0.85320169, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.88063335, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 4.488275527954102 + }, + { + "auxiliary_loss_clip": 0.01464386, + "auxiliary_loss_mlp": 0.01284456, + "balance_loss_clip": 1.14454031, + "balance_loss_mlp": 1.05118704, + "epoch": 0.3900195400571171, + "flos": 31142060952480.0, + "grad_norm": 2.077099947561791, + "language_loss": 0.85492194, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.88241041, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 2.9032559394836426 + }, + { + "auxiliary_loss_clip": 0.01468071, + "auxiliary_loss_mlp": 0.01283209, + "balance_loss_clip": 1.15062809, + "balance_loss_mlp": 1.05184793, + "epoch": 0.39007966330978505, + "flos": 20231992653120.0, + "grad_norm": 1.5872886853367498, + "language_loss": 0.73139191, + "learning_rate": 2.787216355829633e-06, + "loss": 0.7589047, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 2.8436102867126465 + }, + { + "auxiliary_loss_clip": 0.01480802, + "auxiliary_loss_mlp": 0.01281651, + "balance_loss_clip": 1.16312194, + "balance_loss_mlp": 1.0491451, + "epoch": 0.390139786562453, + "flos": 22530955167360.0, + "grad_norm": 3.4122210272513036, + "language_loss": 0.68915939, + "learning_rate": 2.786858317231779e-06, + "loss": 0.71678394, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.83828067779541 + }, + { + "auxiliary_loss_clip": 0.01469237, + "auxiliary_loss_mlp": 0.0127607, + "balance_loss_clip": 1.15136194, + "balance_loss_mlp": 1.04432726, + "epoch": 0.390199909815121, + "flos": 26435356868160.0, + "grad_norm": 1.5690174908900405, + "language_loss": 0.8040309, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83148396, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.7822256088256836 + }, + { + "auxiliary_loss_clip": 0.01469139, + "auxiliary_loss_mlp": 0.01280077, + "balance_loss_clip": 1.15150094, + "balance_loss_mlp": 1.04661715, + "epoch": 0.39026003306778895, + "flos": 17276487131520.0, + "grad_norm": 1.9755611011084497, + "language_loss": 0.89401799, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.9215101, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 2.7995333671569824 + }, + { + "auxiliary_loss_clip": 0.01468878, + "auxiliary_loss_mlp": 0.01281468, + "balance_loss_clip": 1.15174818, + "balance_loss_mlp": 1.04915261, + "epoch": 0.3903201563204569, + "flos": 24535023323040.0, + "grad_norm": 2.980704438835645, + "language_loss": 0.78498495, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.81248838, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 2.80484938621521 + }, + { + "auxiliary_loss_clip": 0.01470224, + "auxiliary_loss_mlp": 0.01283554, + "balance_loss_clip": 1.15269172, + "balance_loss_mlp": 1.05162096, + "epoch": 0.39038027957312493, + "flos": 23770156397760.0, + "grad_norm": 2.1671508394511756, + "language_loss": 0.74604768, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.7735855, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.8189775943756104 + }, + { + "auxiliary_loss_clip": 0.01470375, + "auxiliary_loss_mlp": 0.01279341, + "balance_loss_clip": 1.15255117, + "balance_loss_mlp": 1.04321098, + "epoch": 0.3904404028257929, + "flos": 14102361509760.0, + "grad_norm": 1.9339777416475115, + "language_loss": 0.75902802, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78652519, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.8531112670898438 + }, + { + "auxiliary_loss_clip": 0.01473144, + "auxiliary_loss_mlp": 0.01309622, + "balance_loss_clip": 1.15522063, + "balance_loss_mlp": 1.07349181, + "epoch": 0.39050052607846086, + "flos": 16911955942560.0, + "grad_norm": 2.6071847540398934, + "language_loss": 0.74451745, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.77234513, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.7622082233428955 + }, + { + "auxiliary_loss_clip": 0.01475438, + "auxiliary_loss_mlp": 0.01281565, + "balance_loss_clip": 1.15781212, + "balance_loss_mlp": 1.04715157, + "epoch": 0.39056064933112883, + "flos": 25917822024480.0, + "grad_norm": 1.6944045930271285, + "language_loss": 0.68183088, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70940089, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 4.291001319885254 + }, + { + "auxiliary_loss_clip": 0.01544495, + "auxiliary_loss_mlp": 0.01345398, + "balance_loss_clip": 1.25112307, + "balance_loss_mlp": 1.14169312, + "epoch": 0.3906207725837968, + "flos": 60034352578560.0, + "grad_norm": 0.7048581794529416, + "language_loss": 0.53877425, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56767321, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 4.957536697387695 + }, + { + "auxiliary_loss_clip": 0.01472838, + "auxiliary_loss_mlp": 0.012745, + "balance_loss_clip": 1.15512466, + "balance_loss_mlp": 1.0416131, + "epoch": 0.39068089583646476, + "flos": 21070782290880.0, + "grad_norm": 2.1812395227500083, + "language_loss": 0.69213688, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71961021, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.831881523132324 + }, + { + "auxiliary_loss_clip": 0.01540353, + "auxiliary_loss_mlp": 0.01243919, + "balance_loss_clip": 1.24674869, + "balance_loss_mlp": 1.03639984, + "epoch": 0.3907410190891327, + "flos": 70453246105440.0, + "grad_norm": 0.7279121168605088, + "language_loss": 0.51737022, + "learning_rate": 2.783276292417936e-06, + "loss": 0.54521298, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.308645009994507 + }, + { + "auxiliary_loss_clip": 0.01466009, + "auxiliary_loss_mlp": 0.01275165, + "balance_loss_clip": 1.148036, + "balance_loss_mlp": 1.03846288, + "epoch": 0.3908011423418007, + "flos": 27964825221600.0, + "grad_norm": 2.1909395800477385, + "language_loss": 0.73858315, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76599485, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.85219144821167 + }, + { + "auxiliary_loss_clip": 0.0147698, + "auxiliary_loss_mlp": 0.01275906, + "balance_loss_clip": 1.16018486, + "balance_loss_mlp": 1.03920436, + "epoch": 0.39086126559446865, + "flos": 24464400360480.0, + "grad_norm": 2.115602760147273, + "language_loss": 0.69127363, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.71880257, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.812242269515991 + }, + { + "auxiliary_loss_clip": 0.01466761, + "auxiliary_loss_mlp": 0.01270597, + "balance_loss_clip": 1.1495378, + "balance_loss_mlp": 1.03503954, + "epoch": 0.3909213888471366, + "flos": 16943171182560.0, + "grad_norm": 2.1282244202613048, + "language_loss": 0.78874737, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81612098, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 4.245984792709351 + }, + { + "auxiliary_loss_clip": 0.01476374, + "auxiliary_loss_mlp": 0.0126616, + "balance_loss_clip": 1.15981078, + "balance_loss_mlp": 1.02983928, + "epoch": 0.3909815120998046, + "flos": 29280983417280.0, + "grad_norm": 2.170468872700832, + "language_loss": 0.7979157, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.82534105, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 2.93500018119812 + }, + { + "auxiliary_loss_clip": 0.0146176, + "auxiliary_loss_mlp": 0.01257965, + "balance_loss_clip": 1.14508331, + "balance_loss_mlp": 1.02355194, + "epoch": 0.39104163535247255, + "flos": 18953156131200.0, + "grad_norm": 1.865469634481486, + "language_loss": 0.71987414, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.74707139, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.77174973487854 + }, + { + "auxiliary_loss_clip": 0.01459051, + "auxiliary_loss_mlp": 0.01263331, + "balance_loss_clip": 1.14225698, + "balance_loss_mlp": 1.02548456, + "epoch": 0.3911017586051405, + "flos": 26325781320960.0, + "grad_norm": 1.6586657088592311, + "language_loss": 0.83018076, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85740459, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 2.88573956489563 + }, + { + "auxiliary_loss_clip": 0.01460104, + "auxiliary_loss_mlp": 0.01266077, + "balance_loss_clip": 1.14348686, + "balance_loss_mlp": 1.02746773, + "epoch": 0.3911618818578085, + "flos": 21837962833920.0, + "grad_norm": 2.0730849174917143, + "language_loss": 0.71184993, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73911178, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.7521135807037354 + }, + { + "auxiliary_loss_clip": 0.01467311, + "auxiliary_loss_mlp": 0.0126863, + "balance_loss_clip": 1.15194774, + "balance_loss_mlp": 1.03307223, + "epoch": 0.3912220051104765, + "flos": 16361385307200.0, + "grad_norm": 2.065971171489243, + "language_loss": 0.75281966, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.78017914, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 2.8706369400024414 + }, + { + "auxiliary_loss_clip": 0.01505305, + "auxiliary_loss_mlp": 0.01259857, + "balance_loss_clip": 1.21338165, + "balance_loss_mlp": 1.04165649, + "epoch": 0.39128212836314447, + "flos": 71057751312960.0, + "grad_norm": 0.8445672217484471, + "language_loss": 0.56459892, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.59225053, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 3.5447754859924316 + }, + { + "auxiliary_loss_clip": 0.01456132, + "auxiliary_loss_mlp": 0.01256667, + "balance_loss_clip": 1.13974893, + "balance_loss_mlp": 1.02244413, + "epoch": 0.39134225161581243, + "flos": 20333641214880.0, + "grad_norm": 2.2725383185924715, + "language_loss": 0.76234066, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78946865, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.7421417236328125 + }, + { + "auxiliary_loss_clip": 0.01462089, + "auxiliary_loss_mlp": 0.01268532, + "balance_loss_clip": 1.14528632, + "balance_loss_mlp": 1.03087664, + "epoch": 0.3914023748684804, + "flos": 17020241932320.0, + "grad_norm": 4.1600817484683645, + "language_loss": 0.82936895, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85667515, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 2.8590736389160156 + }, + { + "auxiliary_loss_clip": 0.01458574, + "auxiliary_loss_mlp": 0.01268893, + "balance_loss_clip": 1.14172745, + "balance_loss_mlp": 1.03181005, + "epoch": 0.39146249812114836, + "flos": 18407174803200.0, + "grad_norm": 1.9332822562678853, + "language_loss": 0.7655071, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.79278171, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 2.9083752632141113 + }, + { + "auxiliary_loss_clip": 0.01490414, + "auxiliary_loss_mlp": 0.01221863, + "balance_loss_clip": 1.1993562, + "balance_loss_mlp": 1.0112915, + "epoch": 0.3915226213738163, + "flos": 67644827445600.0, + "grad_norm": 0.7152140147419564, + "language_loss": 0.57765883, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.60478157, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 3.4713735580444336 + }, + { + "auxiliary_loss_clip": 0.01462545, + "auxiliary_loss_mlp": 0.0127922, + "balance_loss_clip": 1.14666724, + "balance_loss_mlp": 1.04442549, + "epoch": 0.3915827446264843, + "flos": 26361737580960.0, + "grad_norm": 1.6444527293163629, + "language_loss": 0.69754648, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72496414, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.9933547973632812 + }, + { + "auxiliary_loss_clip": 0.01463226, + "auxiliary_loss_mlp": 0.01283481, + "balance_loss_clip": 1.14812875, + "balance_loss_mlp": 1.04696941, + "epoch": 0.39164286787915226, + "flos": 21946021254720.0, + "grad_norm": 3.2959925938668233, + "language_loss": 0.76182944, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.78929651, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 2.8730006217956543 + }, + { + "auxiliary_loss_clip": 0.0145495, + "auxiliary_loss_mlp": 0.01280046, + "balance_loss_clip": 1.13895845, + "balance_loss_mlp": 1.04715884, + "epoch": 0.3917029911318202, + "flos": 16401741233760.0, + "grad_norm": 2.1852342124604953, + "language_loss": 0.7751081, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.80245811, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 2.8701162338256836 + }, + { + "auxiliary_loss_clip": 0.01460058, + "auxiliary_loss_mlp": 0.01278044, + "balance_loss_clip": 1.14455712, + "balance_loss_mlp": 1.04973412, + "epoch": 0.3917631143844882, + "flos": 26214005940480.0, + "grad_norm": 1.320883295448386, + "language_loss": 0.79896444, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82634544, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 2.924555778503418 + }, + { + "auxiliary_loss_clip": 0.01457475, + "auxiliary_loss_mlp": 0.01288038, + "balance_loss_clip": 1.14088225, + "balance_loss_mlp": 1.05648613, + "epoch": 0.39182323763715615, + "flos": 18550165423680.0, + "grad_norm": 2.3598678304397613, + "language_loss": 0.70546043, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.73291552, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 2.8557965755462646 + }, + { + "auxiliary_loss_clip": 0.01460821, + "auxiliary_loss_mlp": 0.01278265, + "balance_loss_clip": 1.14489865, + "balance_loss_mlp": 1.04671252, + "epoch": 0.3918833608898241, + "flos": 34316414143200.0, + "grad_norm": 1.5455921033741016, + "language_loss": 0.71857846, + "learning_rate": 2.776462273631956e-06, + "loss": 0.7459693, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 2.986896514892578 + }, + { + "auxiliary_loss_clip": 0.01452843, + "auxiliary_loss_mlp": 0.01297438, + "balance_loss_clip": 1.13735819, + "balance_loss_mlp": 1.06340635, + "epoch": 0.3919434841424921, + "flos": 36942548244480.0, + "grad_norm": 1.7819502958317148, + "language_loss": 0.61404133, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.6415441, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 2.9084763526916504 + }, + { + "auxiliary_loss_clip": 0.01467886, + "auxiliary_loss_mlp": 0.01286509, + "balance_loss_clip": 1.15215683, + "balance_loss_mlp": 1.05037963, + "epoch": 0.3920036073951601, + "flos": 23510952802080.0, + "grad_norm": 4.278443500921603, + "language_loss": 0.67371702, + "learning_rate": 2.775744388563563e-06, + "loss": 0.70126092, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.781189203262329 + }, + { + "auxiliary_loss_clip": 0.01451394, + "auxiliary_loss_mlp": 0.01274511, + "balance_loss_clip": 1.13529384, + "balance_loss_mlp": 1.04143262, + "epoch": 0.39206373064782807, + "flos": 18408085079040.0, + "grad_norm": 2.0062806475407973, + "language_loss": 0.78416872, + "learning_rate": 2.775385401898104e-06, + "loss": 0.81142777, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.7571098804473877 + }, + { + "auxiliary_loss_clip": 0.01457916, + "auxiliary_loss_mlp": 0.01271599, + "balance_loss_clip": 1.14192402, + "balance_loss_mlp": 1.0329895, + "epoch": 0.39212385390049603, + "flos": 12314713620960.0, + "grad_norm": 3.2759051381613475, + "language_loss": 0.70438886, + "learning_rate": 2.775026385829952e-06, + "loss": 0.73168397, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.797835350036621 + }, + { + "auxiliary_loss_clip": 0.01463284, + "auxiliary_loss_mlp": 0.01274476, + "balance_loss_clip": 1.1476934, + "balance_loss_mlp": 1.03987241, + "epoch": 0.392183977153164, + "flos": 19721209021920.0, + "grad_norm": 1.7665108327318306, + "language_loss": 0.77009851, + "learning_rate": 2.774667340372722e-06, + "loss": 0.79747611, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 2.7209489345550537 + }, + { + "auxiliary_loss_clip": 0.01458371, + "auxiliary_loss_mlp": 0.01284732, + "balance_loss_clip": 1.1433866, + "balance_loss_mlp": 1.04898417, + "epoch": 0.39224410040583196, + "flos": 33147646234560.0, + "grad_norm": 2.819825017284462, + "language_loss": 0.61677808, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.64420915, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 4.538638591766357 + }, + { + "auxiliary_loss_clip": 0.0145496, + "auxiliary_loss_mlp": 0.01280402, + "balance_loss_clip": 1.13915467, + "balance_loss_mlp": 1.04808652, + "epoch": 0.39230422365849993, + "flos": 27784930137120.0, + "grad_norm": 2.8709369156273543, + "language_loss": 0.73891962, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76627326, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.8070082664489746 + }, + { + "auxiliary_loss_clip": 0.01460216, + "auxiliary_loss_mlp": 0.01282157, + "balance_loss_clip": 1.14454949, + "balance_loss_mlp": 1.04831624, + "epoch": 0.3923643469111679, + "flos": 17933599061280.0, + "grad_norm": 1.9442929948314465, + "language_loss": 0.81620228, + "learning_rate": 2.773590027802719e-06, + "loss": 0.84362602, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 2.726473808288574 + }, + { + "auxiliary_loss_clip": 0.01461894, + "auxiliary_loss_mlp": 0.01281974, + "balance_loss_clip": 1.1473496, + "balance_loss_mlp": 1.04546285, + "epoch": 0.39242447016383586, + "flos": 24061751006400.0, + "grad_norm": 1.6868318599244512, + "language_loss": 0.70253092, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72996962, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 2.771958827972412 + }, + { + "auxiliary_loss_clip": 0.01460978, + "auxiliary_loss_mlp": 0.0126864, + "balance_loss_clip": 1.14616978, + "balance_loss_mlp": 1.03365445, + "epoch": 0.3924845934165038, + "flos": 10665239476320.0, + "grad_norm": 2.8984709499678645, + "language_loss": 0.81842732, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84572345, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.7390639781951904 + }, + { + "auxiliary_loss_clip": 0.01462659, + "auxiliary_loss_mlp": 0.01265342, + "balance_loss_clip": 1.14850354, + "balance_loss_mlp": 1.03169179, + "epoch": 0.3925447166691718, + "flos": 31248450534240.0, + "grad_norm": 2.493804870568252, + "language_loss": 0.69150281, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.71878284, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 2.8499245643615723 + }, + { + "auxiliary_loss_clip": 0.01457186, + "auxiliary_loss_mlp": 0.01277687, + "balance_loss_clip": 1.14257288, + "balance_loss_mlp": 1.04193926, + "epoch": 0.39260483992183975, + "flos": 29417109040800.0, + "grad_norm": 2.701719737064521, + "language_loss": 0.80617899, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.83352768, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.858945369720459 + }, + { + "auxiliary_loss_clip": 0.01457563, + "auxiliary_loss_mlp": 0.01265078, + "balance_loss_clip": 1.14328623, + "balance_loss_mlp": 1.03085518, + "epoch": 0.3926649631745077, + "flos": 22860061090560.0, + "grad_norm": 1.4751680681414072, + "language_loss": 0.75645196, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.78367841, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.8311634063720703 + }, + { + "auxiliary_loss_clip": 0.01518262, + "auxiliary_loss_mlp": 0.01355614, + "balance_loss_clip": 1.23263264, + "balance_loss_mlp": 1.14580536, + "epoch": 0.3927250864271757, + "flos": 63899915479200.0, + "grad_norm": 0.8465521316063794, + "language_loss": 0.60311514, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.63185394, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 3.1748061180114746 + }, + { + "auxiliary_loss_clip": 0.01528052, + "auxiliary_loss_mlp": 0.01317764, + "balance_loss_clip": 1.24349689, + "balance_loss_mlp": 1.10795593, + "epoch": 0.3927852096798437, + "flos": 68917026539520.0, + "grad_norm": 0.8050556647407636, + "language_loss": 0.55407453, + "learning_rate": 2.771075272396981e-06, + "loss": 0.58253264, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.3650574684143066 + }, + { + "auxiliary_loss_clip": 0.01463901, + "auxiliary_loss_mlp": 0.01273401, + "balance_loss_clip": 1.14943647, + "balance_loss_mlp": 1.03307521, + "epoch": 0.39284533293251167, + "flos": 29718564971040.0, + "grad_norm": 2.3864673924658746, + "language_loss": 0.76222265, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78959572, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 4.284024477005005 + }, + { + "auxiliary_loss_clip": 0.01461192, + "auxiliary_loss_mlp": 0.01264799, + "balance_loss_clip": 1.14651334, + "balance_loss_mlp": 1.0218029, + "epoch": 0.39290545618517964, + "flos": 18554034096000.0, + "grad_norm": 2.4245243455083325, + "language_loss": 0.78352249, + "learning_rate": 2.770356507494851e-06, + "loss": 0.81078243, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 4.255434989929199 + }, + { + "auxiliary_loss_clip": 0.01459496, + "auxiliary_loss_mlp": 0.01267706, + "balance_loss_clip": 1.14581978, + "balance_loss_mlp": 1.02661705, + "epoch": 0.3929655794378476, + "flos": 26251858608480.0, + "grad_norm": 2.4004434107651846, + "language_loss": 0.69158655, + "learning_rate": 2.769997081218978e-06, + "loss": 0.71885854, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.8389852046966553 + }, + { + "auxiliary_loss_clip": 0.01465342, + "auxiliary_loss_mlp": 0.01261374, + "balance_loss_clip": 1.15193248, + "balance_loss_mlp": 1.02066612, + "epoch": 0.39302570269051557, + "flos": 29280490351200.0, + "grad_norm": 1.6839703697710506, + "language_loss": 0.68998253, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71724963, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.795309543609619 + }, + { + "auxiliary_loss_clip": 0.01468311, + "auxiliary_loss_mlp": 0.01263029, + "balance_loss_clip": 1.1543442, + "balance_loss_mlp": 1.01965141, + "epoch": 0.39308582594318353, + "flos": 17349347855520.0, + "grad_norm": 2.1247634934015083, + "language_loss": 0.78648031, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81379366, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.772413969039917 + }, + { + "auxiliary_loss_clip": 0.01555579, + "auxiliary_loss_mlp": 0.01234261, + "balance_loss_clip": 1.27212524, + "balance_loss_mlp": 1.01300812, + "epoch": 0.3931459491958515, + "flos": 61012719302400.0, + "grad_norm": 0.8471381221539814, + "language_loss": 0.6187411, + "learning_rate": 2.768918627255683e-06, + "loss": 0.64663953, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 3.138611316680908 + }, + { + "auxiliary_loss_clip": 0.01468569, + "auxiliary_loss_mlp": 0.01261711, + "balance_loss_clip": 1.15523005, + "balance_loss_mlp": 1.01776123, + "epoch": 0.39320607244851946, + "flos": 39018908201760.0, + "grad_norm": 4.817325186965218, + "language_loss": 0.67970079, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70700359, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 4.353183746337891 + }, + { + "auxiliary_loss_clip": 0.01467435, + "auxiliary_loss_mlp": 0.01262254, + "balance_loss_clip": 1.15480518, + "balance_loss_mlp": 1.02040255, + "epoch": 0.3932661957011874, + "flos": 24681996400320.0, + "grad_norm": 2.6589198236205784, + "language_loss": 0.72880065, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.75609761, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 2.7974348068237305 + }, + { + "auxiliary_loss_clip": 0.01532374, + "auxiliary_loss_mlp": 0.01228096, + "balance_loss_clip": 1.24822867, + "balance_loss_mlp": 1.00684357, + "epoch": 0.3933263189538554, + "flos": 70102938340800.0, + "grad_norm": 0.8595875856997647, + "language_loss": 0.60253453, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.63013923, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 3.106333017349243 + }, + { + "auxiliary_loss_clip": 0.01458136, + "auxiliary_loss_mlp": 0.01269336, + "balance_loss_clip": 1.14503264, + "balance_loss_mlp": 1.02748382, + "epoch": 0.39338644220652336, + "flos": 22931252975520.0, + "grad_norm": 1.6116435022698912, + "language_loss": 0.82421541, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.85149014, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.7763724327087402 + }, + { + "auxiliary_loss_clip": 0.01461856, + "auxiliary_loss_mlp": 0.01267711, + "balance_loss_clip": 1.14850295, + "balance_loss_mlp": 1.0279572, + "epoch": 0.3934465654591913, + "flos": 30850883553600.0, + "grad_norm": 1.7871600076076415, + "language_loss": 0.69549704, + "learning_rate": 2.767120621015908e-06, + "loss": 0.72279269, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.8949646949768066 + }, + { + "auxiliary_loss_clip": 0.01464992, + "auxiliary_loss_mlp": 0.01267288, + "balance_loss_clip": 1.15157318, + "balance_loss_mlp": 1.02581787, + "epoch": 0.3935066887118593, + "flos": 29238806939040.0, + "grad_norm": 2.210570076398781, + "language_loss": 0.75737113, + "learning_rate": 2.76676093244553e-06, + "loss": 0.78469396, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 2.8235652446746826 + }, + { + "auxiliary_loss_clip": 0.01468256, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_clip": 1.15579939, + "balance_loss_mlp": 1.02685511, + "epoch": 0.3935668119645273, + "flos": 19137299169600.0, + "grad_norm": 1.548062916762332, + "language_loss": 0.74826515, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.77553374, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 2.7660036087036133 + }, + { + "auxiliary_loss_clip": 0.01465177, + "auxiliary_loss_mlp": 0.01281002, + "balance_loss_clip": 1.15111399, + "balance_loss_mlp": 1.04506254, + "epoch": 0.3936269352171953, + "flos": 18518153692320.0, + "grad_norm": 1.9693088942736983, + "language_loss": 0.81822395, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.84568578, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 2.7326109409332275 + }, + { + "auxiliary_loss_clip": 0.01462436, + "auxiliary_loss_mlp": 0.01283098, + "balance_loss_clip": 1.14930773, + "balance_loss_mlp": 1.05249929, + "epoch": 0.39368705846986324, + "flos": 15634636547040.0, + "grad_norm": 1.8900689420591557, + "language_loss": 0.84093618, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86839157, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 2.7944605350494385 + }, + { + "auxiliary_loss_clip": 0.01458651, + "auxiliary_loss_mlp": 0.01272133, + "balance_loss_clip": 1.14557159, + "balance_loss_mlp": 1.04077148, + "epoch": 0.3937471817225312, + "flos": 21328051550400.0, + "grad_norm": 1.589587055602684, + "language_loss": 0.73257703, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.75988483, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 2.80206036567688 + }, + { + "auxiliary_loss_clip": 0.01465732, + "auxiliary_loss_mlp": 0.01287854, + "balance_loss_clip": 1.15085721, + "balance_loss_mlp": 1.05687451, + "epoch": 0.39380730497519917, + "flos": 20778770472480.0, + "grad_norm": 1.6163817826240552, + "language_loss": 0.77762938, + "learning_rate": 2.764962053731699e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 2.7162086963653564 + }, + { + "auxiliary_loss_clip": 0.01452795, + "auxiliary_loss_mlp": 0.01278108, + "balance_loss_clip": 1.13884783, + "balance_loss_mlp": 1.04846311, + "epoch": 0.39386742822786713, + "flos": 21611567460960.0, + "grad_norm": 2.088863976210732, + "language_loss": 0.81515098, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.84245998, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 2.8384616374969482 + }, + { + "auxiliary_loss_clip": 0.01455227, + "auxiliary_loss_mlp": 0.01275321, + "balance_loss_clip": 1.14116716, + "balance_loss_mlp": 1.04395938, + "epoch": 0.3939275514805351, + "flos": 12415869116640.0, + "grad_norm": 5.0530127217925465, + "language_loss": 0.80771184, + "learning_rate": 2.764242299098596e-06, + "loss": 0.83501732, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 2.719461679458618 + }, + { + "auxiliary_loss_clip": 0.01463539, + "auxiliary_loss_mlp": 0.01273314, + "balance_loss_clip": 1.15018106, + "balance_loss_mlp": 1.04271591, + "epoch": 0.39398767473320306, + "flos": 18554034096000.0, + "grad_norm": 1.7240669931917765, + "language_loss": 0.71104175, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73841035, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.8656883239746094 + }, + { + "auxiliary_loss_clip": 0.01459329, + "auxiliary_loss_mlp": 0.01279312, + "balance_loss_clip": 1.1454823, + "balance_loss_mlp": 1.04756975, + "epoch": 0.39404779798587103, + "flos": 29311136668800.0, + "grad_norm": 1.5911571714373829, + "language_loss": 0.63833213, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.66571856, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.8164734840393066 + }, + { + "auxiliary_loss_clip": 0.0145605, + "auxiliary_loss_mlp": 0.01272132, + "balance_loss_clip": 1.14346123, + "balance_loss_mlp": 1.04344141, + "epoch": 0.394107921238539, + "flos": 34899944713920.0, + "grad_norm": 1.8992844112277871, + "language_loss": 0.79428077, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.82156259, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.8259506225585938 + }, + { + "auxiliary_loss_clip": 0.01466732, + "auxiliary_loss_mlp": 0.01282553, + "balance_loss_clip": 1.15309012, + "balance_loss_mlp": 1.05023766, + "epoch": 0.39416804449120696, + "flos": 25083697550400.0, + "grad_norm": 1.8285401244945332, + "language_loss": 0.71318352, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.74067634, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 2.7712018489837646 + }, + { + "auxiliary_loss_clip": 0.01457636, + "auxiliary_loss_mlp": 0.012807, + "balance_loss_clip": 1.14400399, + "balance_loss_mlp": 1.04991078, + "epoch": 0.3942281677438749, + "flos": 32309387591040.0, + "grad_norm": 2.057649930233107, + "language_loss": 0.8375932, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86497653, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 2.834172248840332 + }, + { + "auxiliary_loss_clip": 0.01466698, + "auxiliary_loss_mlp": 0.01275541, + "balance_loss_clip": 1.15366983, + "balance_loss_mlp": 1.04093671, + "epoch": 0.3942882909965429, + "flos": 24938962234560.0, + "grad_norm": 2.143668606033879, + "language_loss": 0.80395865, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.83138108, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 2.7573375701904297 + }, + { + "auxiliary_loss_clip": 0.01466571, + "auxiliary_loss_mlp": 0.01280497, + "balance_loss_clip": 1.15414667, + "balance_loss_mlp": 1.04837298, + "epoch": 0.39434841424921085, + "flos": 11876411432160.0, + "grad_norm": 1.8296012875856065, + "language_loss": 0.71334165, + "learning_rate": 2.761722245724792e-06, + "loss": 0.74081236, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.7084007263183594 + }, + { + "auxiliary_loss_clip": 0.01460475, + "auxiliary_loss_mlp": 0.01275047, + "balance_loss_clip": 1.14635825, + "balance_loss_mlp": 1.04006124, + "epoch": 0.3944085375018789, + "flos": 16363433427840.0, + "grad_norm": 2.2011779609932858, + "language_loss": 0.80465746, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.83201265, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.7953033447265625 + }, + { + "auxiliary_loss_clip": 0.01463964, + "auxiliary_loss_mlp": 0.0129284, + "balance_loss_clip": 1.15164328, + "balance_loss_mlp": 1.06166923, + "epoch": 0.39446866075454684, + "flos": 10634858655840.0, + "grad_norm": 2.1365924748605876, + "language_loss": 0.82512963, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85269773, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 2.6995701789855957 + }, + { + "auxiliary_loss_clip": 0.01461455, + "auxiliary_loss_mlp": 0.01275818, + "balance_loss_clip": 1.14813828, + "balance_loss_mlp": 1.04426622, + "epoch": 0.3945287840072148, + "flos": 18189502907040.0, + "grad_norm": 4.338935770647972, + "language_loss": 0.79970944, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.82708216, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 4.373692989349365 + }, + { + "auxiliary_loss_clip": 0.01467418, + "auxiliary_loss_mlp": 0.01265691, + "balance_loss_clip": 1.15452719, + "balance_loss_mlp": 1.03337598, + "epoch": 0.39458890725988277, + "flos": 23042156008320.0, + "grad_norm": 1.6185875357000572, + "language_loss": 0.81510735, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.84243852, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 2.815650463104248 + }, + { + "auxiliary_loss_clip": 0.01461332, + "auxiliary_loss_mlp": 0.01274836, + "balance_loss_clip": 1.1479528, + "balance_loss_mlp": 1.04232979, + "epoch": 0.39464903051255074, + "flos": 17160615509760.0, + "grad_norm": 2.29649452560783, + "language_loss": 0.69861275, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72597444, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 2.788593053817749 + }, + { + "auxiliary_loss_clip": 0.01469816, + "auxiliary_loss_mlp": 0.01270922, + "balance_loss_clip": 1.1555084, + "balance_loss_mlp": 1.03364825, + "epoch": 0.3947091537652187, + "flos": 15890881746240.0, + "grad_norm": 4.7800321304638915, + "language_loss": 0.83231521, + "learning_rate": 2.759561073299676e-06, + "loss": 0.8597225, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 2.8130593299865723 + }, + { + "auxiliary_loss_clip": 0.01464163, + "auxiliary_loss_mlp": 0.01269618, + "balance_loss_clip": 1.15154696, + "balance_loss_mlp": 1.03692126, + "epoch": 0.39476927701788667, + "flos": 18547131170880.0, + "grad_norm": 2.0852327363920806, + "language_loss": 0.83390599, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.86124384, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 2.7792625427246094 + }, + { + "auxiliary_loss_clip": 0.0146788, + "auxiliary_loss_mlp": 0.01266488, + "balance_loss_clip": 1.15354502, + "balance_loss_mlp": 1.02883267, + "epoch": 0.39482940027055463, + "flos": 22278047646240.0, + "grad_norm": 1.9515399687867325, + "language_loss": 0.77758342, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.80492711, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.8124074935913086 + }, + { + "auxiliary_loss_clip": 0.01460895, + "auxiliary_loss_mlp": 0.01264455, + "balance_loss_clip": 1.14798164, + "balance_loss_mlp": 1.03309405, + "epoch": 0.3948895235232226, + "flos": 14759435511360.0, + "grad_norm": 5.536769781604655, + "language_loss": 0.80669439, + "learning_rate": 2.758480098067182e-06, + "loss": 0.83394784, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 2.7631428241729736 + }, + { + "auxiliary_loss_clip": 0.01461305, + "auxiliary_loss_mlp": 0.01268242, + "balance_loss_clip": 1.14786458, + "balance_loss_mlp": 1.03363836, + "epoch": 0.39494964677589056, + "flos": 22568011344000.0, + "grad_norm": 1.8687907340223007, + "language_loss": 0.84485483, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.8721503, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.739877223968506 + }, + { + "auxiliary_loss_clip": 0.01478981, + "auxiliary_loss_mlp": 0.01268911, + "balance_loss_clip": 1.16639829, + "balance_loss_mlp": 1.03773999, + "epoch": 0.3950097700285585, + "flos": 22965236971200.0, + "grad_norm": 1.9998377871927977, + "language_loss": 0.74760902, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77508795, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.8210198879241943 + }, + { + "auxiliary_loss_clip": 0.01458357, + "auxiliary_loss_mlp": 0.01251574, + "balance_loss_clip": 1.14614582, + "balance_loss_mlp": 1.0200218, + "epoch": 0.3950698932812265, + "flos": 20597358261600.0, + "grad_norm": 1.7575729869339451, + "language_loss": 0.80050087, + "learning_rate": 2.757398863979922e-06, + "loss": 0.82760018, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.7268786430358887 + }, + { + "auxiliary_loss_clip": 0.01466779, + "auxiliary_loss_mlp": 0.01266206, + "balance_loss_clip": 1.15419388, + "balance_loss_mlp": 1.03293729, + "epoch": 0.39513001653389446, + "flos": 20377865813760.0, + "grad_norm": 1.8193982252580814, + "language_loss": 0.77841347, + "learning_rate": 2.757038395157997e-06, + "loss": 0.80574334, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 4.17066764831543 + }, + { + "auxiliary_loss_clip": 0.01468374, + "auxiliary_loss_mlp": 0.012712, + "balance_loss_clip": 1.15495729, + "balance_loss_mlp": 1.03487968, + "epoch": 0.3951901397865625, + "flos": 26465775616800.0, + "grad_norm": 1.8757404355605174, + "language_loss": 0.75228679, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77968252, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 4.355619668960571 + }, + { + "auxiliary_loss_clip": 0.01471074, + "auxiliary_loss_mlp": 0.01261578, + "balance_loss_clip": 1.15906811, + "balance_loss_mlp": 1.0305984, + "epoch": 0.39525026303923044, + "flos": 43841863189440.0, + "grad_norm": 1.5277048133063054, + "language_loss": 0.67754203, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70486856, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 3.0093913078308105 + }, + { + "auxiliary_loss_clip": 0.01476443, + "auxiliary_loss_mlp": 0.01262478, + "balance_loss_clip": 1.16344178, + "balance_loss_mlp": 1.02653933, + "epoch": 0.3953103862918984, + "flos": 18042757398720.0, + "grad_norm": 2.6609533827267686, + "language_loss": 0.72397894, + "learning_rate": 2.755956816505072e-06, + "loss": 0.75136817, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.7686564922332764 + }, + { + "auxiliary_loss_clip": 0.01469101, + "auxiliary_loss_mlp": 0.01255731, + "balance_loss_clip": 1.15688348, + "balance_loss_mlp": 1.02017331, + "epoch": 0.3953705095445664, + "flos": 16977534459840.0, + "grad_norm": 2.1725349022350486, + "language_loss": 0.73613501, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.76338327, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.7405664920806885 + }, + { + "auxiliary_loss_clip": 0.01468186, + "auxiliary_loss_mlp": 0.0126555, + "balance_loss_clip": 1.1560986, + "balance_loss_mlp": 1.03495193, + "epoch": 0.39543063279723434, + "flos": 17412536898720.0, + "grad_norm": 2.2413895209308596, + "language_loss": 0.83925849, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.86659586, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.7142586708068848 + }, + { + "auxiliary_loss_clip": 0.01470274, + "auxiliary_loss_mlp": 0.01256299, + "balance_loss_clip": 1.15788698, + "balance_loss_mlp": 1.02570081, + "epoch": 0.3954907560499023, + "flos": 22786441803360.0, + "grad_norm": 2.228893543360737, + "language_loss": 0.89785177, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92511749, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 4.240872621536255 + }, + { + "auxiliary_loss_clip": 0.01469269, + "auxiliary_loss_mlp": 0.01259799, + "balance_loss_clip": 1.15630603, + "balance_loss_mlp": 1.02576697, + "epoch": 0.39555087930257027, + "flos": 21946552248960.0, + "grad_norm": 4.7413421924154635, + "language_loss": 0.77846527, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80575597, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 2.7610790729522705 + }, + { + "auxiliary_loss_clip": 0.01475641, + "auxiliary_loss_mlp": 0.01261453, + "balance_loss_clip": 1.16326272, + "balance_loss_mlp": 1.02532315, + "epoch": 0.39561100255523823, + "flos": 20406274369920.0, + "grad_norm": 2.167063571960138, + "language_loss": 0.68570817, + "learning_rate": 2.754153612280037e-06, + "loss": 0.71307909, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.7946088314056396 + }, + { + "auxiliary_loss_clip": 0.01471879, + "auxiliary_loss_mlp": 0.01264433, + "balance_loss_clip": 1.16087162, + "balance_loss_mlp": 1.03383493, + "epoch": 0.3956711258079062, + "flos": 27967062983040.0, + "grad_norm": 2.123749327374431, + "language_loss": 0.58919579, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.61655891, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.8263046741485596 + }, + { + "auxiliary_loss_clip": 0.01482661, + "auxiliary_loss_mlp": 0.01268018, + "balance_loss_clip": 1.17034626, + "balance_loss_mlp": 1.0357033, + "epoch": 0.39573124906057416, + "flos": 14430026162880.0, + "grad_norm": 2.181744356754768, + "language_loss": 0.69438744, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72189426, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.719616651535034 + }, + { + "auxiliary_loss_clip": 0.01481052, + "auxiliary_loss_mlp": 0.01269251, + "balance_loss_clip": 1.16880405, + "balance_loss_mlp": 1.03598177, + "epoch": 0.39579137231324213, + "flos": 18735673875840.0, + "grad_norm": 2.1807631177260207, + "language_loss": 0.755898, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78340101, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 2.747875213623047 + }, + { + "auxiliary_loss_clip": 0.01466062, + "auxiliary_loss_mlp": 0.01262186, + "balance_loss_clip": 1.15418839, + "balance_loss_mlp": 1.027964, + "epoch": 0.3958514955659101, + "flos": 17678377922400.0, + "grad_norm": 1.5480840942483094, + "language_loss": 0.65777409, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68505657, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 2.74239182472229 + }, + { + "auxiliary_loss_clip": 0.01478891, + "auxiliary_loss_mlp": 0.01273158, + "balance_loss_clip": 1.16674423, + "balance_loss_mlp": 1.03492975, + "epoch": 0.39591161881857806, + "flos": 29311364237760.0, + "grad_norm": 2.5593543273919637, + "language_loss": 0.72624767, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.75376809, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 2.834733247756958 + }, + { + "auxiliary_loss_clip": 0.01469106, + "auxiliary_loss_mlp": 0.01260339, + "balance_loss_clip": 1.15638661, + "balance_loss_mlp": 1.02554417, + "epoch": 0.3959717420712461, + "flos": 25773996984480.0, + "grad_norm": 1.9330129663588687, + "language_loss": 0.73568332, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.76297772, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 2.778127431869507 + }, + { + "auxiliary_loss_clip": 0.01478244, + "auxiliary_loss_mlp": 0.01269252, + "balance_loss_clip": 1.1669035, + "balance_loss_mlp": 1.03445697, + "epoch": 0.39603186532391405, + "flos": 20925933190560.0, + "grad_norm": 1.8535933284418231, + "language_loss": 0.71460199, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.74207693, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 2.769855499267578 + }, + { + "auxiliary_loss_clip": 0.01575935, + "auxiliary_loss_mlp": 0.01384506, + "balance_loss_clip": 1.29421234, + "balance_loss_mlp": 1.18080139, + "epoch": 0.396091988576582, + "flos": 54886463765280.0, + "grad_norm": 0.9666706566249252, + "language_loss": 0.61057067, + "learning_rate": 2.751266999157285e-06, + "loss": 0.6401751, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 3.1153244972229004 + }, + { + "auxiliary_loss_clip": 0.01471021, + "auxiliary_loss_mlp": 0.01265122, + "balance_loss_clip": 1.15913737, + "balance_loss_mlp": 1.03051805, + "epoch": 0.39615211182925, + "flos": 20704923616320.0, + "grad_norm": 2.190637152882935, + "language_loss": 0.81588262, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.84324408, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.7076168060302734 + }, + { + "auxiliary_loss_clip": 0.0147204, + "auxiliary_loss_mlp": 0.0126319, + "balance_loss_clip": 1.15967321, + "balance_loss_mlp": 1.02648807, + "epoch": 0.39621223508191794, + "flos": 20996328584160.0, + "grad_norm": 2.6847092361257574, + "language_loss": 0.70225453, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72960681, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 2.7891645431518555 + }, + { + "auxiliary_loss_clip": 0.01475243, + "auxiliary_loss_mlp": 0.01263478, + "balance_loss_clip": 1.16340685, + "balance_loss_mlp": 1.02715731, + "epoch": 0.3962723583345859, + "flos": 23371148147040.0, + "grad_norm": 2.4773290283989478, + "language_loss": 0.75921321, + "learning_rate": 2.750184048805956e-06, + "loss": 0.78660047, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.823442220687866 + }, + { + "auxiliary_loss_clip": 0.01474308, + "auxiliary_loss_mlp": 0.01256828, + "balance_loss_clip": 1.16250086, + "balance_loss_mlp": 1.02108002, + "epoch": 0.39633248158725387, + "flos": 25117757402400.0, + "grad_norm": 1.7358014577490384, + "language_loss": 0.78959477, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81690621, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 2.7988803386688232 + }, + { + "auxiliary_loss_clip": 0.01471326, + "auxiliary_loss_mlp": 0.01263947, + "balance_loss_clip": 1.15953493, + "balance_loss_mlp": 1.03239501, + "epoch": 0.39639260483992184, + "flos": 39790867692960.0, + "grad_norm": 1.7575748002032594, + "language_loss": 0.69673133, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.72408402, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 3.0103793144226074 + }, + { + "auxiliary_loss_clip": 0.01470215, + "auxiliary_loss_mlp": 0.01255164, + "balance_loss_clip": 1.1583941, + "balance_loss_mlp": 1.01769865, + "epoch": 0.3964527280925898, + "flos": 17348930645760.0, + "grad_norm": 1.9911928298915955, + "language_loss": 0.77627689, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80353069, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.775972843170166 + }, + { + "auxiliary_loss_clip": 0.01575885, + "auxiliary_loss_mlp": 0.01225998, + "balance_loss_clip": 1.29469192, + "balance_loss_mlp": 1.01771545, + "epoch": 0.39651285134525777, + "flos": 71725217630400.0, + "grad_norm": 0.9404269301863912, + "language_loss": 0.62965798, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65767682, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.3990204334259033 + }, + { + "auxiliary_loss_clip": 0.01468708, + "auxiliary_loss_mlp": 0.0126749, + "balance_loss_clip": 1.15704274, + "balance_loss_mlp": 1.02697301, + "epoch": 0.39657297459792573, + "flos": 25778093225760.0, + "grad_norm": 2.1268485769426166, + "language_loss": 0.6328119, + "learning_rate": 2.748378562795223e-06, + "loss": 0.66017383, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 2.8315083980560303 + }, + { + "auxiliary_loss_clip": 0.01469596, + "auxiliary_loss_mlp": 0.01252177, + "balance_loss_clip": 1.15835214, + "balance_loss_mlp": 1.02005315, + "epoch": 0.3966330978505937, + "flos": 20268176482080.0, + "grad_norm": 2.1789646649632575, + "language_loss": 0.79038811, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.81760591, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 2.786574363708496 + }, + { + "auxiliary_loss_clip": 0.01472999, + "auxiliary_loss_mlp": 0.01261744, + "balance_loss_clip": 1.16086984, + "balance_loss_mlp": 1.01989245, + "epoch": 0.39669322110326166, + "flos": 20633276593440.0, + "grad_norm": 6.371763845360435, + "language_loss": 0.67837125, + "learning_rate": 2.747656169644941e-06, + "loss": 0.70571876, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.7399191856384277 + }, + { + "auxiliary_loss_clip": 0.01465533, + "auxiliary_loss_mlp": 0.01253777, + "balance_loss_clip": 1.15366077, + "balance_loss_mlp": 1.01669383, + "epoch": 0.3967533443559297, + "flos": 21728235574080.0, + "grad_norm": 1.680785127224042, + "language_loss": 0.79163659, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81882966, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 2.981285810470581 + }, + { + "auxiliary_loss_clip": 0.0147066, + "auxiliary_loss_mlp": 0.012562, + "balance_loss_clip": 1.15846515, + "balance_loss_mlp": 1.01949847, + "epoch": 0.39681346760859765, + "flos": 25486536545280.0, + "grad_norm": 1.868863446189881, + "language_loss": 0.72917467, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.75644326, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 4.420992374420166 + }, + { + "auxiliary_loss_clip": 0.01469008, + "auxiliary_loss_mlp": 0.01262746, + "balance_loss_clip": 1.15732729, + "balance_loss_mlp": 1.02604365, + "epoch": 0.3968735908612656, + "flos": 20961699809760.0, + "grad_norm": 2.0722548981278393, + "language_loss": 0.85644007, + "learning_rate": 2.746572367319791e-06, + "loss": 0.88375759, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 2.790835380554199 + }, + { + "auxiliary_loss_clip": 0.01473404, + "auxiliary_loss_mlp": 0.01265709, + "balance_loss_clip": 1.16057479, + "balance_loss_mlp": 1.02786303, + "epoch": 0.3969337141139336, + "flos": 10708629655680.0, + "grad_norm": 2.315403414313404, + "language_loss": 0.69969082, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72708189, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 2.7948122024536133 + }, + { + "auxiliary_loss_clip": 0.01470523, + "auxiliary_loss_mlp": 0.01264117, + "balance_loss_clip": 1.15888071, + "balance_loss_mlp": 1.02989423, + "epoch": 0.39699383736660154, + "flos": 17595124882560.0, + "grad_norm": 3.7205834573433876, + "language_loss": 0.8301726, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85751903, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 2.727151870727539 + }, + { + "auxiliary_loss_clip": 0.01471239, + "auxiliary_loss_mlp": 0.01262062, + "balance_loss_clip": 1.15913105, + "balance_loss_mlp": 1.03108263, + "epoch": 0.3970539606192695, + "flos": 17787915541440.0, + "grad_norm": 1.8001689128658387, + "language_loss": 0.73202527, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75935829, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 2.7570652961730957 + }, + { + "auxiliary_loss_clip": 0.01473067, + "auxiliary_loss_mlp": 0.01259223, + "balance_loss_clip": 1.1617837, + "balance_loss_mlp": 1.02748036, + "epoch": 0.3971140838719375, + "flos": 24791761588320.0, + "grad_norm": 1.5805264245142938, + "language_loss": 0.82647431, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85379732, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.772662401199341 + }, + { + "auxiliary_loss_clip": 0.01467256, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 1.15514684, + "balance_loss_mlp": 1.03318095, + "epoch": 0.39717420712460544, + "flos": 24245894044800.0, + "grad_norm": 1.7045066130625772, + "language_loss": 0.73762262, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.76495016, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 2.828002691268921 + }, + { + "auxiliary_loss_clip": 0.014727, + "auxiliary_loss_mlp": 0.01256386, + "balance_loss_clip": 1.16126859, + "balance_loss_mlp": 1.01968384, + "epoch": 0.3972343303772734, + "flos": 25887061922400.0, + "grad_norm": 6.402676150546017, + "language_loss": 0.73768437, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76497525, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.926969528198242 + }, + { + "auxiliary_loss_clip": 0.01475814, + "auxiliary_loss_mlp": 0.01269739, + "balance_loss_clip": 1.16429877, + "balance_loss_mlp": 1.0339905, + "epoch": 0.39729445362994137, + "flos": 45627197460480.0, + "grad_norm": 2.202512543686981, + "language_loss": 0.68014729, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70760286, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 2.9137935638427734 + }, + { + "auxiliary_loss_clip": 0.01472502, + "auxiliary_loss_mlp": 0.01270322, + "balance_loss_clip": 1.16155648, + "balance_loss_mlp": 1.0307591, + "epoch": 0.39735457688260933, + "flos": 20196074321280.0, + "grad_norm": 2.1240975983590644, + "language_loss": 0.74321902, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.77064729, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 2.7661960124969482 + }, + { + "auxiliary_loss_clip": 0.01470977, + "auxiliary_loss_mlp": 0.01269997, + "balance_loss_clip": 1.15900087, + "balance_loss_mlp": 1.03539348, + "epoch": 0.3974147001352773, + "flos": 23333409263520.0, + "grad_norm": 1.7614426418307936, + "language_loss": 0.71392846, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.74133825, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 2.779461622238159 + }, + { + "auxiliary_loss_clip": 0.01470496, + "auxiliary_loss_mlp": 0.01264718, + "balance_loss_clip": 1.16000307, + "balance_loss_mlp": 1.03469169, + "epoch": 0.39747482338794526, + "flos": 21690762187680.0, + "grad_norm": 1.961377104756007, + "language_loss": 0.78093475, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80828691, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 5.786315679550171 + }, + { + "auxiliary_loss_clip": 0.0148092, + "auxiliary_loss_mlp": 0.01266741, + "balance_loss_clip": 1.16934407, + "balance_loss_mlp": 1.03270948, + "epoch": 0.3975349466406133, + "flos": 30991143346560.0, + "grad_norm": 1.9010627540892844, + "language_loss": 0.79208505, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81956166, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.793975830078125 + }, + { + "auxiliary_loss_clip": 0.01598804, + "auxiliary_loss_mlp": 0.01215256, + "balance_loss_clip": 1.31756878, + "balance_loss_mlp": 1.00315857, + "epoch": 0.39759506989328125, + "flos": 63690473993760.0, + "grad_norm": 0.8855285812025419, + "language_loss": 0.64857286, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67671347, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.268383741378784 + }, + { + "auxiliary_loss_clip": 0.01470803, + "auxiliary_loss_mlp": 0.01270951, + "balance_loss_clip": 1.15956616, + "balance_loss_mlp": 1.03596544, + "epoch": 0.3976551931459492, + "flos": 23698092165120.0, + "grad_norm": 6.339607432412015, + "language_loss": 0.72440863, + "learning_rate": 2.741872951078109e-06, + "loss": 0.75182617, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.761509895324707 + }, + { + "auxiliary_loss_clip": 0.01464496, + "auxiliary_loss_mlp": 0.01276804, + "balance_loss_clip": 1.1524123, + "balance_loss_mlp": 1.04563332, + "epoch": 0.3977153163986172, + "flos": 15671503082880.0, + "grad_norm": 2.183995999715676, + "language_loss": 0.81197989, + "learning_rate": 2.741511260213862e-06, + "loss": 0.8393929, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.7759509086608887 + }, + { + "auxiliary_loss_clip": 0.01471139, + "auxiliary_loss_mlp": 0.01272319, + "balance_loss_clip": 1.15984631, + "balance_loss_mlp": 1.04038501, + "epoch": 0.39777543965128515, + "flos": 14066481106080.0, + "grad_norm": 2.140447282569761, + "language_loss": 0.67649156, + "learning_rate": 2.741149541231434e-06, + "loss": 0.70392609, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 4.291004657745361 + }, + { + "auxiliary_loss_clip": 0.01462501, + "auxiliary_loss_mlp": 0.01272739, + "balance_loss_clip": 1.15009928, + "balance_loss_mlp": 1.03870702, + "epoch": 0.3978355629039531, + "flos": 23369706876960.0, + "grad_norm": 2.3016690740028904, + "language_loss": 0.83757073, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86492312, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.7485077381134033 + }, + { + "auxiliary_loss_clip": 0.01464941, + "auxiliary_loss_mlp": 0.01263049, + "balance_loss_clip": 1.15358126, + "balance_loss_mlp": 1.0337857, + "epoch": 0.3978956861566211, + "flos": 19064817727200.0, + "grad_norm": 1.6614175063476309, + "language_loss": 0.72637272, + "learning_rate": 2.7404260189669e-06, + "loss": 0.75365257, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.85593318939209 + }, + { + "auxiliary_loss_clip": 0.01459008, + "auxiliary_loss_mlp": 0.01275553, + "balance_loss_clip": 1.14592361, + "balance_loss_mlp": 1.04247499, + "epoch": 0.39795580940928904, + "flos": 30230372662560.0, + "grad_norm": 1.7432696209295158, + "language_loss": 0.65411413, + "learning_rate": 2.740064215712231e-06, + "loss": 0.68145967, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.792741298675537 + }, + { + "auxiliary_loss_clip": 0.01553819, + "auxiliary_loss_mlp": 0.01229935, + "balance_loss_clip": 1.26675618, + "balance_loss_mlp": 1.0193634, + "epoch": 0.398015932661957, + "flos": 69853937420160.0, + "grad_norm": 0.7753818207508428, + "language_loss": 0.58140433, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60924184, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 3.3031880855560303 + }, + { + "auxiliary_loss_clip": 0.01444382, + "auxiliary_loss_mlp": 0.01273147, + "balance_loss_clip": 1.1308924, + "balance_loss_mlp": 1.04388392, + "epoch": 0.39807605591462497, + "flos": 20159890492320.0, + "grad_norm": 7.612043053731406, + "language_loss": 0.79082108, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81799638, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 2.7571723461151123 + }, + { + "auxiliary_loss_clip": 0.01450009, + "auxiliary_loss_mlp": 0.01281216, + "balance_loss_clip": 1.13596237, + "balance_loss_mlp": 1.05080795, + "epoch": 0.39813617916729294, + "flos": 21143870583840.0, + "grad_norm": 1.931334095934736, + "language_loss": 0.78214025, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80945253, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.7316665649414062 + }, + { + "auxiliary_loss_clip": 0.01453761, + "auxiliary_loss_mlp": 0.01282279, + "balance_loss_clip": 1.13888085, + "balance_loss_mlp": 1.05225301, + "epoch": 0.3981963024199609, + "flos": 18990439876800.0, + "grad_norm": 2.335988541147344, + "language_loss": 0.74711633, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77447677, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 2.799865961074829 + }, + { + "auxiliary_loss_clip": 0.0145575, + "auxiliary_loss_mlp": 0.01270636, + "balance_loss_clip": 1.14147091, + "balance_loss_mlp": 1.03965604, + "epoch": 0.39825642567262887, + "flos": 16576629801120.0, + "grad_norm": 1.8249713512202679, + "language_loss": 0.79801857, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.82528239, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 2.7282395362854004 + }, + { + "auxiliary_loss_clip": 0.01454311, + "auxiliary_loss_mlp": 0.01275811, + "balance_loss_clip": 1.13928545, + "balance_loss_mlp": 1.04387772, + "epoch": 0.39831654892529683, + "flos": 22202000956800.0, + "grad_norm": 2.301007231174176, + "language_loss": 0.84209025, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.86939156, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 2.7757434844970703 + }, + { + "auxiliary_loss_clip": 0.01448122, + "auxiliary_loss_mlp": 0.01275967, + "balance_loss_clip": 1.13445818, + "balance_loss_mlp": 1.04498744, + "epoch": 0.39837667217796485, + "flos": 10489478561280.0, + "grad_norm": 2.191857000129096, + "language_loss": 0.86906171, + "learning_rate": 2.737530807925321e-06, + "loss": 0.89630258, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 2.7978549003601074 + }, + { + "auxiliary_loss_clip": 0.01464078, + "auxiliary_loss_mlp": 0.01286989, + "balance_loss_clip": 1.1509459, + "balance_loss_mlp": 1.05391097, + "epoch": 0.3984367954306328, + "flos": 17967089990880.0, + "grad_norm": 3.1578248707734113, + "language_loss": 0.83448637, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86199701, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 2.7626564502716064 + }, + { + "auxiliary_loss_clip": 0.01457173, + "auxiliary_loss_mlp": 0.01275995, + "balance_loss_clip": 1.14341617, + "balance_loss_mlp": 1.04749513, + "epoch": 0.3984969186833008, + "flos": 22713277654080.0, + "grad_norm": 1.5627924221629292, + "language_loss": 0.82939452, + "learning_rate": 2.736806725217998e-06, + "loss": 0.85672629, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 2.8069944381713867 + }, + { + "auxiliary_loss_clip": 0.01461699, + "auxiliary_loss_mlp": 0.01279714, + "balance_loss_clip": 1.14795804, + "balance_loss_mlp": 1.04797149, + "epoch": 0.39855704193596875, + "flos": 23408128467360.0, + "grad_norm": 1.7515140868364036, + "language_loss": 0.70869267, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73610681, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.818673610687256 + }, + { + "auxiliary_loss_clip": 0.0146915, + "auxiliary_loss_mlp": 0.01274809, + "balance_loss_clip": 1.15618968, + "balance_loss_mlp": 1.0440197, + "epoch": 0.3986171651886367, + "flos": 21254090909760.0, + "grad_norm": 1.9622794469958522, + "language_loss": 0.80872566, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.83616525, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.785778284072876 + }, + { + "auxiliary_loss_clip": 0.01460673, + "auxiliary_loss_mlp": 0.01261182, + "balance_loss_clip": 1.14744604, + "balance_loss_mlp": 1.0290581, + "epoch": 0.3986772884413047, + "flos": 12460359212640.0, + "grad_norm": 7.019832286826624, + "language_loss": 0.75242782, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.7796464, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.737874746322632 + }, + { + "auxiliary_loss_clip": 0.01454008, + "auxiliary_loss_mlp": 0.01262328, + "balance_loss_clip": 1.13924789, + "balance_loss_mlp": 1.02848709, + "epoch": 0.39873741169397264, + "flos": 19648158657120.0, + "grad_norm": 1.8820059929917443, + "language_loss": 0.71369481, + "learning_rate": 2.735358224635783e-06, + "loss": 0.74085814, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 2.797471046447754 + }, + { + "auxiliary_loss_clip": 0.0144956, + "auxiliary_loss_mlp": 0.01257737, + "balance_loss_clip": 1.13515484, + "balance_loss_mlp": 1.02618527, + "epoch": 0.3987975349466406, + "flos": 21686665946400.0, + "grad_norm": 2.085008803020167, + "language_loss": 0.74726021, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.77433324, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.7156035900115967 + }, + { + "auxiliary_loss_clip": 0.01456375, + "auxiliary_loss_mlp": 0.01261179, + "balance_loss_clip": 1.14119887, + "balance_loss_mlp": 1.02886403, + "epoch": 0.3988576581993086, + "flos": 23916295055520.0, + "grad_norm": 2.5055977808959913, + "language_loss": 0.81307352, + "learning_rate": 2.7346338069806e-06, + "loss": 0.84024906, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.794581890106201 + }, + { + "auxiliary_loss_clip": 0.01463138, + "auxiliary_loss_mlp": 0.01261406, + "balance_loss_clip": 1.14752805, + "balance_loss_mlp": 1.0256573, + "epoch": 0.39891778145197654, + "flos": 18151839879840.0, + "grad_norm": 1.8834960138304193, + "language_loss": 0.74621367, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.77345908, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 2.7982754707336426 + }, + { + "auxiliary_loss_clip": 0.01456204, + "auxiliary_loss_mlp": 0.01271035, + "balance_loss_clip": 1.14056897, + "balance_loss_mlp": 1.03204465, + "epoch": 0.3989779047046445, + "flos": 22597140535200.0, + "grad_norm": 1.9206388616286043, + "language_loss": 0.6597625, + "learning_rate": 2.733909277895868e-06, + "loss": 0.6870349, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 2.7822937965393066 + }, + { + "auxiliary_loss_clip": 0.01458816, + "auxiliary_loss_mlp": 0.01266828, + "balance_loss_clip": 1.14384699, + "balance_loss_mlp": 1.03260565, + "epoch": 0.39903802795731247, + "flos": 18079168796640.0, + "grad_norm": 1.7071203410295688, + "language_loss": 0.81510079, + "learning_rate": 2.733546971601763e-06, + "loss": 0.84235728, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 2.784822702407837 + }, + { + "auxiliary_loss_clip": 0.01623839, + "auxiliary_loss_mlp": 0.01214561, + "balance_loss_clip": 1.3271687, + "balance_loss_mlp": 1.00475311, + "epoch": 0.39909815120998043, + "flos": 70448694726240.0, + "grad_norm": 0.7192498795171783, + "language_loss": 0.53115284, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55953681, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 5.027011156082153 + }, + { + "auxiliary_loss_clip": 0.0144954, + "auxiliary_loss_mlp": 0.01256397, + "balance_loss_clip": 1.13265193, + "balance_loss_mlp": 1.01893163, + "epoch": 0.39915827446264845, + "flos": 18551113627680.0, + "grad_norm": 3.649198296892403, + "language_loss": 0.75471997, + "learning_rate": 2.732822275578769e-06, + "loss": 0.78177941, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.7926080226898193 + }, + { + "auxiliary_loss_clip": 0.014529, + "auxiliary_loss_mlp": 0.01262278, + "balance_loss_clip": 1.13694215, + "balance_loss_mlp": 1.03263283, + "epoch": 0.3992183977153164, + "flos": 29899787541120.0, + "grad_norm": 1.952369812896428, + "language_loss": 0.76686537, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.7940172, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 2.8877153396606445 + }, + { + "auxiliary_loss_clip": 0.01448632, + "auxiliary_loss_mlp": 0.01263824, + "balance_loss_clip": 1.13229954, + "balance_loss_mlp": 1.02960205, + "epoch": 0.3992785209679844, + "flos": 22567290708960.0, + "grad_norm": 2.4329550475216752, + "language_loss": 0.81795627, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84508085, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 2.786161422729492 + }, + { + "auxiliary_loss_clip": 0.01448976, + "auxiliary_loss_mlp": 0.01256928, + "balance_loss_clip": 1.13147211, + "balance_loss_mlp": 1.02213335, + "epoch": 0.39933864422065235, + "flos": 19684721767680.0, + "grad_norm": 2.2786628775852975, + "language_loss": 0.76209533, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78915435, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.766037702560425 + }, + { + "auxiliary_loss_clip": 0.01452107, + "auxiliary_loss_mlp": 0.01260772, + "balance_loss_clip": 1.13436651, + "balance_loss_mlp": 1.0263592, + "epoch": 0.3993987674733203, + "flos": 23040676810080.0, + "grad_norm": 2.1249899354417945, + "language_loss": 0.72707331, + "learning_rate": 2.731372550178393e-06, + "loss": 0.75420207, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 2.737605094909668 + }, + { + "auxiliary_loss_clip": 0.01451689, + "auxiliary_loss_mlp": 0.01253287, + "balance_loss_clip": 1.13368964, + "balance_loss_mlp": 1.01792073, + "epoch": 0.3994588907259883, + "flos": 19392861661920.0, + "grad_norm": 1.591705747586049, + "language_loss": 0.66312099, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.69017071, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.764819860458374 + }, + { + "auxiliary_loss_clip": 0.0145221, + "auxiliary_loss_mlp": 0.01266091, + "balance_loss_clip": 1.13573194, + "balance_loss_mlp": 1.0324409, + "epoch": 0.39951901397865625, + "flos": 13736047697280.0, + "grad_norm": 1.989343798343658, + "language_loss": 0.78329045, + "learning_rate": 2.730647521020907e-06, + "loss": 0.81047344, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 2.6683456897735596 + }, + { + "auxiliary_loss_clip": 0.01446851, + "auxiliary_loss_mlp": 0.01268486, + "balance_loss_clip": 1.13066065, + "balance_loss_mlp": 1.03540778, + "epoch": 0.3995791372313242, + "flos": 23588706258720.0, + "grad_norm": 1.8718895668033257, + "language_loss": 0.69876349, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72591686, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 2.7368154525756836 + }, + { + "auxiliary_loss_clip": 0.01449737, + "auxiliary_loss_mlp": 0.01264111, + "balance_loss_clip": 1.13231695, + "balance_loss_mlp": 1.03179598, + "epoch": 0.3996392604839922, + "flos": 21357749664000.0, + "grad_norm": 1.9403871134810264, + "language_loss": 0.71813786, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74527633, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.7662582397460938 + }, + { + "auxiliary_loss_clip": 0.01443215, + "auxiliary_loss_mlp": 0.01263998, + "balance_loss_clip": 1.12638354, + "balance_loss_mlp": 1.03568804, + "epoch": 0.39969938373666014, + "flos": 26034717706560.0, + "grad_norm": 1.6523214433477964, + "language_loss": 0.74470294, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.77177507, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.76729154586792 + }, + { + "auxiliary_loss_clip": 0.01441376, + "auxiliary_loss_mlp": 0.01265149, + "balance_loss_clip": 1.12297106, + "balance_loss_mlp": 1.0335964, + "epoch": 0.3997595069893281, + "flos": 20118207080160.0, + "grad_norm": 2.3749422802440874, + "language_loss": 0.66031718, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68738246, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 4.215905427932739 + }, + { + "auxiliary_loss_clip": 0.01450557, + "auxiliary_loss_mlp": 0.01277936, + "balance_loss_clip": 1.13324523, + "balance_loss_mlp": 1.0479095, + "epoch": 0.39981963024199607, + "flos": 27785954197440.0, + "grad_norm": 1.8475555791392804, + "language_loss": 0.7524966, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77978146, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 4.339155197143555 + }, + { + "auxiliary_loss_clip": 0.01438716, + "auxiliary_loss_mlp": 0.0126265, + "balance_loss_clip": 1.12016678, + "balance_loss_mlp": 1.03300536, + "epoch": 0.39987975349466404, + "flos": 21946590177120.0, + "grad_norm": 1.716505391659202, + "language_loss": 0.71842563, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74543929, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.8468716144561768 + }, + { + "auxiliary_loss_clip": 0.01442758, + "auxiliary_loss_mlp": 0.01269925, + "balance_loss_clip": 1.12431192, + "balance_loss_mlp": 1.03818178, + "epoch": 0.39993987674733206, + "flos": 20706516599040.0, + "grad_norm": 1.9502810361596432, + "language_loss": 0.73341197, + "learning_rate": 2.728109046945403e-06, + "loss": 0.76053882, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.8354544639587402 + }, + { + "auxiliary_loss_clip": 0.01593013, + "auxiliary_loss_mlp": 0.01248589, + "balance_loss_clip": 1.29270124, + "balance_loss_mlp": 1.04106903, + "epoch": 0.4, + "flos": 61531505775360.0, + "grad_norm": 0.8381222295268655, + "language_loss": 0.60472715, + "learning_rate": 2.727746297241862e-06, + "loss": 0.63314319, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.256767749786377 + }, + { + "auxiliary_loss_clip": 0.01446759, + "auxiliary_loss_mlp": 0.01263385, + "balance_loss_clip": 1.12733817, + "balance_loss_mlp": 1.03469348, + "epoch": 0.400060123252668, + "flos": 14504100588000.0, + "grad_norm": 2.904827512107781, + "language_loss": 0.6669085, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69400996, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 2.7291085720062256 + }, + { + "auxiliary_loss_clip": 0.01447643, + "auxiliary_loss_mlp": 0.01258087, + "balance_loss_clip": 1.12906408, + "balance_loss_mlp": 1.02577209, + "epoch": 0.40012024650533595, + "flos": 19095008906880.0, + "grad_norm": 2.536430705641959, + "language_loss": 0.90137345, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92843074, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 4.328363418579102 + }, + { + "auxiliary_loss_clip": 0.01448584, + "auxiliary_loss_mlp": 0.01260333, + "balance_loss_clip": 1.13071334, + "balance_loss_mlp": 1.03431249, + "epoch": 0.4001803697580039, + "flos": 29353578644160.0, + "grad_norm": 2.176049256576926, + "language_loss": 0.73371804, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.76080728, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 2.846240758895874 + }, + { + "auxiliary_loss_clip": 0.01443057, + "auxiliary_loss_mlp": 0.01263826, + "balance_loss_clip": 1.12438881, + "balance_loss_mlp": 1.03074801, + "epoch": 0.4002404930106719, + "flos": 20921988661920.0, + "grad_norm": 1.5871595444188393, + "language_loss": 0.7359488, + "learning_rate": 2.726295022603144e-06, + "loss": 0.76301765, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.784297227859497 + }, + { + "auxiliary_loss_clip": 0.01457106, + "auxiliary_loss_mlp": 0.01263347, + "balance_loss_clip": 1.13878322, + "balance_loss_mlp": 1.02817035, + "epoch": 0.40030061626333985, + "flos": 28408437352800.0, + "grad_norm": 1.6051504754911767, + "language_loss": 0.79499036, + "learning_rate": 2.725932135056117e-06, + "loss": 0.82219493, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 2.880572557449341 + }, + { + "auxiliary_loss_clip": 0.01450297, + "auxiliary_loss_mlp": 0.01261135, + "balance_loss_clip": 1.13082957, + "balance_loss_mlp": 1.02538681, + "epoch": 0.4003607395160078, + "flos": 25924004314560.0, + "grad_norm": 1.932986894284965, + "language_loss": 0.77267927, + "learning_rate": 2.72556921998167e-06, + "loss": 0.7997936, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.8286542892456055 + }, + { + "auxiliary_loss_clip": 0.01442292, + "auxiliary_loss_mlp": 0.01252239, + "balance_loss_clip": 1.12377369, + "balance_loss_mlp": 1.02412033, + "epoch": 0.4004208627686758, + "flos": 20770274564640.0, + "grad_norm": 2.078925057698242, + "language_loss": 0.7255463, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.75249159, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 2.8167147636413574 + }, + { + "auxiliary_loss_clip": 0.01438566, + "auxiliary_loss_mlp": 0.0126058, + "balance_loss_clip": 1.11956263, + "balance_loss_mlp": 1.02959979, + "epoch": 0.40048098602134374, + "flos": 24683551454880.0, + "grad_norm": 4.960018910587717, + "language_loss": 0.71459126, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.74158269, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.840097427368164 + }, + { + "auxiliary_loss_clip": 0.01449992, + "auxiliary_loss_mlp": 0.01263366, + "balance_loss_clip": 1.13158107, + "balance_loss_mlp": 1.02876163, + "epoch": 0.4005411092740117, + "flos": 23187953312640.0, + "grad_norm": 2.003531028107265, + "language_loss": 0.7566399, + "learning_rate": 2.724480309731437e-06, + "loss": 0.78377348, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 2.8642120361328125 + }, + { + "auxiliary_loss_clip": 0.01435854, + "auxiliary_loss_mlp": 0.01259426, + "balance_loss_clip": 1.11574757, + "balance_loss_mlp": 1.02577591, + "epoch": 0.4006012325266797, + "flos": 17523819213120.0, + "grad_norm": 2.7880804753675106, + "language_loss": 0.66798174, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.69493449, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.7611496448516846 + }, + { + "auxiliary_loss_clip": 0.01442528, + "auxiliary_loss_mlp": 0.01257969, + "balance_loss_clip": 1.12277544, + "balance_loss_mlp": 1.02565384, + "epoch": 0.40066135577934764, + "flos": 19858813843680.0, + "grad_norm": 2.396833913005164, + "language_loss": 0.86271477, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.88971972, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 2.7368738651275635 + }, + { + "auxiliary_loss_clip": 0.01436776, + "auxiliary_loss_mlp": 0.01253969, + "balance_loss_clip": 1.11786079, + "balance_loss_mlp": 1.01936483, + "epoch": 0.40072147903201566, + "flos": 18151953664320.0, + "grad_norm": 2.0085232533682116, + "language_loss": 0.84660709, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87351459, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 2.7336690425872803 + }, + { + "auxiliary_loss_clip": 0.01441838, + "auxiliary_loss_mlp": 0.01253987, + "balance_loss_clip": 1.12269163, + "balance_loss_mlp": 1.01919293, + "epoch": 0.4007816022846836, + "flos": 18663268289760.0, + "grad_norm": 2.0438473643800847, + "language_loss": 0.7840305, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.81098878, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.7690980434417725 + }, + { + "auxiliary_loss_clip": 0.0144485, + "auxiliary_loss_mlp": 0.01260791, + "balance_loss_clip": 1.12654281, + "balance_loss_mlp": 1.02370763, + "epoch": 0.4008417255373516, + "flos": 25705573855200.0, + "grad_norm": 8.584690553969002, + "language_loss": 0.73478585, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.76184225, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.8080456256866455 + }, + { + "auxiliary_loss_clip": 0.01441329, + "auxiliary_loss_mlp": 0.0126891, + "balance_loss_clip": 1.12240195, + "balance_loss_mlp": 1.02991915, + "epoch": 0.40090184879001955, + "flos": 22860933438240.0, + "grad_norm": 1.4404840331017863, + "language_loss": 0.76021206, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.78731441, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.7964284420013428 + }, + { + "auxiliary_loss_clip": 0.01448494, + "auxiliary_loss_mlp": 0.01270911, + "balance_loss_clip": 1.12860084, + "balance_loss_mlp": 1.03611684, + "epoch": 0.4009619720426875, + "flos": 29062439173440.0, + "grad_norm": 2.1894687691106043, + "language_loss": 0.81996703, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84716111, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.865171432495117 + }, + { + "auxiliary_loss_clip": 0.01569011, + "auxiliary_loss_mlp": 0.01204071, + "balance_loss_clip": 1.26752257, + "balance_loss_mlp": 0.99349976, + "epoch": 0.4010220952953555, + "flos": 66066658970400.0, + "grad_norm": 0.7064167805025241, + "language_loss": 0.53329897, + "learning_rate": 2.721575341289695e-06, + "loss": 0.56102979, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.484639883041382 + }, + { + "auxiliary_loss_clip": 0.01441023, + "auxiliary_loss_mlp": 0.01248816, + "balance_loss_clip": 1.12185073, + "balance_loss_mlp": 1.01707315, + "epoch": 0.40108221854802345, + "flos": 29645400821760.0, + "grad_norm": 1.7390615584480544, + "language_loss": 0.88276178, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.90966022, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 2.838010549545288 + }, + { + "auxiliary_loss_clip": 0.01443368, + "auxiliary_loss_mlp": 0.01252878, + "balance_loss_clip": 1.12508154, + "balance_loss_mlp": 1.01865542, + "epoch": 0.4011423418006914, + "flos": 19931143573440.0, + "grad_norm": 1.926271119152758, + "language_loss": 0.79176688, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81872934, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.7805521488189697 + }, + { + "auxiliary_loss_clip": 0.01442696, + "auxiliary_loss_mlp": 0.01259762, + "balance_loss_clip": 1.12323308, + "balance_loss_mlp": 1.02744651, + "epoch": 0.4012024650533594, + "flos": 20086384989600.0, + "grad_norm": 3.162207210748228, + "language_loss": 0.6326403, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65966493, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.8476200103759766 + }, + { + "auxiliary_loss_clip": 0.01440901, + "auxiliary_loss_mlp": 0.01257592, + "balance_loss_clip": 1.12125611, + "balance_loss_mlp": 1.02680254, + "epoch": 0.40126258830602735, + "flos": 21698234035200.0, + "grad_norm": 1.4630570300179668, + "language_loss": 0.8031162, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.83010107, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 2.862628221511841 + }, + { + "auxiliary_loss_clip": 0.01441443, + "auxiliary_loss_mlp": 0.0127652, + "balance_loss_clip": 1.12160516, + "balance_loss_mlp": 1.04267955, + "epoch": 0.4013227115586953, + "flos": 12021981167520.0, + "grad_norm": 2.521344712791341, + "language_loss": 0.82348502, + "learning_rate": 2.719758846294294e-06, + "loss": 0.85066468, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 4.4735424518585205 + }, + { + "auxiliary_loss_clip": 0.01445901, + "auxiliary_loss_mlp": 0.01261203, + "balance_loss_clip": 1.12576866, + "balance_loss_mlp": 1.03003192, + "epoch": 0.4013828348113633, + "flos": 25449897578400.0, + "grad_norm": 1.6050122418235104, + "language_loss": 0.93243098, + "learning_rate": 2.71939546536012e-06, + "loss": 0.9595021, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 2.7729990482330322 + }, + { + "auxiliary_loss_clip": 0.01447305, + "auxiliary_loss_mlp": 0.01259125, + "balance_loss_clip": 1.12650013, + "balance_loss_mlp": 1.02242279, + "epoch": 0.40144295806403124, + "flos": 18584642485440.0, + "grad_norm": 1.9941471705382157, + "language_loss": 0.79187799, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81894231, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 2.7607061862945557 + }, + { + "auxiliary_loss_clip": 0.01448393, + "auxiliary_loss_mlp": 0.012753, + "balance_loss_clip": 1.12896991, + "balance_loss_mlp": 1.04336667, + "epoch": 0.4015030813166992, + "flos": 22932466676640.0, + "grad_norm": 1.851658096106494, + "language_loss": 0.83785045, + "learning_rate": 2.71866862166691e-06, + "loss": 0.86508739, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 2.7488348484039307 + }, + { + "auxiliary_loss_clip": 0.01453684, + "auxiliary_loss_mlp": 0.01271439, + "balance_loss_clip": 1.13320255, + "balance_loss_mlp": 1.04446459, + "epoch": 0.4015632045693672, + "flos": 20597168620800.0, + "grad_norm": 3.4690985349906738, + "language_loss": 0.63715518, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66440642, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 2.7364919185638428 + }, + { + "auxiliary_loss_clip": 0.01452197, + "auxiliary_loss_mlp": 0.01258763, + "balance_loss_clip": 1.13291347, + "balance_loss_mlp": 1.02911842, + "epoch": 0.4016233278220352, + "flos": 23441088402720.0, + "grad_norm": 1.4871663086216818, + "language_loss": 0.7858389, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81294852, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 2.776787757873535 + }, + { + "auxiliary_loss_clip": 0.01455291, + "auxiliary_loss_mlp": 0.0127248, + "balance_loss_clip": 1.13566828, + "balance_loss_mlp": 1.03940248, + "epoch": 0.40168345107470316, + "flos": 21433037790240.0, + "grad_norm": 1.647753635559192, + "language_loss": 0.76108754, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.78836524, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 2.7690043449401855 + }, + { + "auxiliary_loss_clip": 0.01461358, + "auxiliary_loss_mlp": 0.01278041, + "balance_loss_clip": 1.14129591, + "balance_loss_mlp": 1.0480144, + "epoch": 0.4017435743273711, + "flos": 22859454240000.0, + "grad_norm": 1.8611090125404188, + "language_loss": 0.64072323, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66811717, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.7638442516326904 + }, + { + "auxiliary_loss_clip": 0.01454397, + "auxiliary_loss_mlp": 0.01266543, + "balance_loss_clip": 1.13373351, + "balance_loss_mlp": 1.03766108, + "epoch": 0.4018036975800391, + "flos": 28624895547840.0, + "grad_norm": 1.8806665005558254, + "language_loss": 0.72679341, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75400281, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 2.834350347518921 + }, + { + "auxiliary_loss_clip": 0.01447699, + "auxiliary_loss_mlp": 0.0126069, + "balance_loss_clip": 1.12725055, + "balance_loss_mlp": 1.03066373, + "epoch": 0.40186382083270705, + "flos": 26653635614880.0, + "grad_norm": 1.6670776826276708, + "language_loss": 0.73445415, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.76153803, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.897797107696533 + }, + { + "auxiliary_loss_clip": 0.01576961, + "auxiliary_loss_mlp": 0.0126873, + "balance_loss_clip": 1.27158785, + "balance_loss_mlp": 1.06273651, + "epoch": 0.401923944085375, + "flos": 59265730765440.0, + "grad_norm": 0.8172589529847472, + "language_loss": 0.60415745, + "learning_rate": 2.716123811026767e-06, + "loss": 0.63261437, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 4.8893303871154785 + }, + { + "auxiliary_loss_clip": 0.01451359, + "auxiliary_loss_mlp": 0.012645, + "balance_loss_clip": 1.13066971, + "balance_loss_mlp": 1.03352058, + "epoch": 0.401984067338043, + "flos": 16984702882080.0, + "grad_norm": 2.0752823898458077, + "language_loss": 0.69605052, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72320908, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 4.30877685546875 + }, + { + "auxiliary_loss_clip": 0.01454188, + "auxiliary_loss_mlp": 0.01262922, + "balance_loss_clip": 1.13485849, + "balance_loss_mlp": 1.03690147, + "epoch": 0.40204419059071095, + "flos": 24974804710080.0, + "grad_norm": 1.9184663814515954, + "language_loss": 0.74804246, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.7752136, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 2.8086938858032227 + }, + { + "auxiliary_loss_clip": 0.01457109, + "auxiliary_loss_mlp": 0.01268545, + "balance_loss_clip": 1.13615322, + "balance_loss_mlp": 1.03699267, + "epoch": 0.4021043138433789, + "flos": 23479737562080.0, + "grad_norm": 2.376550224047583, + "language_loss": 0.70992547, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73718196, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.787379026412964 + }, + { + "auxiliary_loss_clip": 0.01452556, + "auxiliary_loss_mlp": 0.0126062, + "balance_loss_clip": 1.13212144, + "balance_loss_mlp": 1.0283047, + "epoch": 0.4021644370960469, + "flos": 25998268380480.0, + "grad_norm": 1.7762811978239417, + "language_loss": 0.64624846, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.67338014, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.8211750984191895 + }, + { + "auxiliary_loss_clip": 0.01448418, + "auxiliary_loss_mlp": 0.0125901, + "balance_loss_clip": 1.12800705, + "balance_loss_mlp": 1.02669537, + "epoch": 0.40222456034871484, + "flos": 13589605614240.0, + "grad_norm": 2.3427081674654513, + "language_loss": 0.73024571, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75731999, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.728764057159424 + }, + { + "auxiliary_loss_clip": 0.01451317, + "auxiliary_loss_mlp": 0.01264804, + "balance_loss_clip": 1.13099957, + "balance_loss_mlp": 1.03172576, + "epoch": 0.4022846836013828, + "flos": 24280219393920.0, + "grad_norm": 1.64465763216423, + "language_loss": 0.74707907, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.77424026, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 2.7721917629241943 + }, + { + "auxiliary_loss_clip": 0.0145917, + "auxiliary_loss_mlp": 0.01266785, + "balance_loss_clip": 1.13944173, + "balance_loss_mlp": 1.03160858, + "epoch": 0.40234480685405083, + "flos": 20153366848800.0, + "grad_norm": 2.606627754632948, + "language_loss": 0.72658664, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.75384617, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 4.255327463150024 + }, + { + "auxiliary_loss_clip": 0.0144747, + "auxiliary_loss_mlp": 0.01263768, + "balance_loss_clip": 1.1267693, + "balance_loss_mlp": 1.03316915, + "epoch": 0.4024049301067188, + "flos": 22932466676640.0, + "grad_norm": 2.0381080231188244, + "language_loss": 0.84460652, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.87171888, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 2.7907326221466064 + }, + { + "auxiliary_loss_clip": 0.01464697, + "auxiliary_loss_mlp": 0.01261635, + "balance_loss_clip": 1.14383399, + "balance_loss_mlp": 1.02779424, + "epoch": 0.40246505335938676, + "flos": 36031390948800.0, + "grad_norm": 1.8595317182444722, + "language_loss": 0.70787096, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73513424, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.903876781463623 + }, + { + "auxiliary_loss_clip": 0.01451586, + "auxiliary_loss_mlp": 0.01252336, + "balance_loss_clip": 1.13035178, + "balance_loss_mlp": 1.02078366, + "epoch": 0.4025251766120547, + "flos": 20596372129440.0, + "grad_norm": 2.689506939583521, + "language_loss": 0.68622351, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.71326274, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.7814605236053467 + }, + { + "auxiliary_loss_clip": 0.01446497, + "auxiliary_loss_mlp": 0.01263946, + "balance_loss_clip": 1.1265353, + "balance_loss_mlp": 1.02972341, + "epoch": 0.4025852998647227, + "flos": 64528505432640.0, + "grad_norm": 4.006090630752131, + "language_loss": 0.79566371, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.82276815, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 3.1295089721679688 + }, + { + "auxiliary_loss_clip": 0.01456873, + "auxiliary_loss_mlp": 0.0126882, + "balance_loss_clip": 1.13759375, + "balance_loss_mlp": 1.03345299, + "epoch": 0.40264542311739066, + "flos": 20888383947840.0, + "grad_norm": 1.675425113375786, + "language_loss": 0.71044928, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73770618, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 2.836390256881714 + }, + { + "auxiliary_loss_clip": 0.01453762, + "auxiliary_loss_mlp": 0.01253718, + "balance_loss_clip": 1.13397253, + "balance_loss_mlp": 1.01892328, + "epoch": 0.4027055463700586, + "flos": 26252617171680.0, + "grad_norm": 2.5558344058425138, + "language_loss": 0.61698425, + "learning_rate": 2.711394207496984e-06, + "loss": 0.64405906, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.817125082015991 + }, + { + "auxiliary_loss_clip": 0.01453825, + "auxiliary_loss_mlp": 0.01259363, + "balance_loss_clip": 1.13289762, + "balance_loss_mlp": 1.02666628, + "epoch": 0.4027656696227266, + "flos": 20633504162400.0, + "grad_norm": 2.2407995832517416, + "language_loss": 0.76822829, + "learning_rate": 2.711030202621491e-06, + "loss": 0.79536015, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 2.7674124240875244 + }, + { + "auxiliary_loss_clip": 0.01446197, + "auxiliary_loss_mlp": 0.0125371, + "balance_loss_clip": 1.12605, + "balance_loss_mlp": 1.0217762, + "epoch": 0.40282579287539455, + "flos": 22348480968000.0, + "grad_norm": 1.8127092975958345, + "language_loss": 0.80410695, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.83110595, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 2.750744581222534 + }, + { + "auxiliary_loss_clip": 0.0144813, + "auxiliary_loss_mlp": 0.01269299, + "balance_loss_clip": 1.12796831, + "balance_loss_mlp": 1.03355062, + "epoch": 0.4028859161280625, + "flos": 29277380242080.0, + "grad_norm": 1.9978010962416688, + "language_loss": 0.75069988, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77787417, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 2.8119821548461914 + }, + { + "auxiliary_loss_clip": 0.01449106, + "auxiliary_loss_mlp": 0.01263721, + "balance_loss_clip": 1.13009953, + "balance_loss_mlp": 1.03007126, + "epoch": 0.4029460393807305, + "flos": 28624895547840.0, + "grad_norm": 1.6450938240401456, + "language_loss": 0.66290897, + "learning_rate": 2.709938026276208e-06, + "loss": 0.69003725, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 2.85127329826355 + }, + { + "auxiliary_loss_clip": 0.01451324, + "auxiliary_loss_mlp": 0.01253587, + "balance_loss_clip": 1.13199604, + "balance_loss_mlp": 1.01612246, + "epoch": 0.40300616263339845, + "flos": 22604157244800.0, + "grad_norm": 2.3062262770617714, + "language_loss": 0.66397697, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.69102609, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 2.760830879211426 + }, + { + "auxiliary_loss_clip": 0.01454557, + "auxiliary_loss_mlp": 0.01260105, + "balance_loss_clip": 1.13504469, + "balance_loss_mlp": 1.02473795, + "epoch": 0.4030662858860664, + "flos": 25522682446080.0, + "grad_norm": 1.9303227229816493, + "language_loss": 0.82529342, + "learning_rate": 2.709209774085071e-06, + "loss": 0.85244, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 2.8599393367767334 + }, + { + "auxiliary_loss_clip": 0.01451121, + "auxiliary_loss_mlp": 0.01258188, + "balance_loss_clip": 1.13231039, + "balance_loss_mlp": 1.02434731, + "epoch": 0.40312640913873443, + "flos": 23588971755840.0, + "grad_norm": 1.6269059975738058, + "language_loss": 0.73318934, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.7602824, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 2.7972733974456787 + }, + { + "auxiliary_loss_clip": 0.01452397, + "auxiliary_loss_mlp": 0.01252065, + "balance_loss_clip": 1.13394272, + "balance_loss_mlp": 1.01974988, + "epoch": 0.4031865323914024, + "flos": 20013107055840.0, + "grad_norm": 1.75982146913014, + "language_loss": 0.66227663, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68932128, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.727222442626953 + }, + { + "auxiliary_loss_clip": 0.0145974, + "auxiliary_loss_mlp": 0.01265256, + "balance_loss_clip": 1.14048707, + "balance_loss_mlp": 1.03332281, + "epoch": 0.40324665564407036, + "flos": 21873577740480.0, + "grad_norm": 3.024283670180839, + "language_loss": 0.71499681, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.74224675, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.806443691253662 + }, + { + "auxiliary_loss_clip": 0.01450205, + "auxiliary_loss_mlp": 0.01259735, + "balance_loss_clip": 1.13189006, + "balance_loss_mlp": 1.03066254, + "epoch": 0.4033067788967383, + "flos": 23881021502400.0, + "grad_norm": 1.837506427234706, + "language_loss": 0.80101871, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82811821, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.778412103652954 + }, + { + "auxiliary_loss_clip": 0.0145402, + "auxiliary_loss_mlp": 0.01268259, + "balance_loss_clip": 1.13476229, + "balance_loss_mlp": 1.03422737, + "epoch": 0.4033669021494063, + "flos": 17421791369760.0, + "grad_norm": 2.562783099734646, + "language_loss": 0.83213115, + "learning_rate": 2.70738867321606e-06, + "loss": 0.8593539, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.753540515899658 + }, + { + "auxiliary_loss_clip": 0.01462788, + "auxiliary_loss_mlp": 0.01266307, + "balance_loss_clip": 1.14351678, + "balance_loss_mlp": 1.03170323, + "epoch": 0.40342702540207426, + "flos": 29602807133760.0, + "grad_norm": 1.7594475190709233, + "language_loss": 0.71029949, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73759037, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 2.8258936405181885 + }, + { + "auxiliary_loss_clip": 0.01458901, + "auxiliary_loss_mlp": 0.01260962, + "balance_loss_clip": 1.14060128, + "balance_loss_mlp": 1.0290283, + "epoch": 0.4034871486547422, + "flos": 11285826223680.0, + "grad_norm": 2.8317239507333043, + "language_loss": 0.84706903, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87426764, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.8030405044555664 + }, + { + "auxiliary_loss_clip": 0.01459429, + "auxiliary_loss_mlp": 0.01265533, + "balance_loss_clip": 1.14025784, + "balance_loss_mlp": 1.0328362, + "epoch": 0.4035472719074102, + "flos": 15554266047360.0, + "grad_norm": 2.329460784123702, + "language_loss": 0.76083785, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78808749, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.8299293518066406 + }, + { + "auxiliary_loss_clip": 0.01462759, + "auxiliary_loss_mlp": 0.01263954, + "balance_loss_clip": 1.14510894, + "balance_loss_mlp": 1.02973151, + "epoch": 0.40360739516007815, + "flos": 24676307176320.0, + "grad_norm": 2.2214201403670537, + "language_loss": 0.79266441, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81993157, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 4.446474313735962 + }, + { + "auxiliary_loss_clip": 0.01460057, + "auxiliary_loss_mlp": 0.01261318, + "balance_loss_clip": 1.14223313, + "balance_loss_mlp": 1.02900314, + "epoch": 0.4036675184127461, + "flos": 17305616322720.0, + "grad_norm": 2.4560354151952235, + "language_loss": 0.88176644, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90898025, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.7599120140075684 + }, + { + "auxiliary_loss_clip": 0.01466274, + "auxiliary_loss_mlp": 0.01262666, + "balance_loss_clip": 1.14726257, + "balance_loss_mlp": 1.02977908, + "epoch": 0.4037276416654141, + "flos": 19866020194080.0, + "grad_norm": 2.552657419028609, + "language_loss": 0.69083357, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71812296, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 2.842355251312256 + }, + { + "auxiliary_loss_clip": 0.01461849, + "auxiliary_loss_mlp": 0.01265496, + "balance_loss_clip": 1.14423454, + "balance_loss_mlp": 1.03279924, + "epoch": 0.40378776491808205, + "flos": 18298244034720.0, + "grad_norm": 2.117756630666729, + "language_loss": 0.77748579, + "learning_rate": 2.704838005767892e-06, + "loss": 0.80475926, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 2.768789529800415 + }, + { + "auxiliary_loss_clip": 0.01467979, + "auxiliary_loss_mlp": 0.01267001, + "balance_loss_clip": 1.15040517, + "balance_loss_mlp": 1.0379281, + "epoch": 0.40384788817075, + "flos": 15051181832640.0, + "grad_norm": 2.0541569997119957, + "language_loss": 0.76260018, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78995001, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 2.7461705207824707 + }, + { + "auxiliary_loss_clip": 0.01588226, + "auxiliary_loss_mlp": 0.01217659, + "balance_loss_clip": 1.29503489, + "balance_loss_mlp": 1.00785065, + "epoch": 0.40390801142341803, + "flos": 61936203250080.0, + "grad_norm": 0.9245939671357956, + "language_loss": 0.60607982, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.63413858, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 3.2061238288879395 + }, + { + "auxiliary_loss_clip": 0.01461029, + "auxiliary_loss_mlp": 0.012707, + "balance_loss_clip": 1.14322531, + "balance_loss_mlp": 1.03647757, + "epoch": 0.403968134676086, + "flos": 22740510437280.0, + "grad_norm": 1.9886181437437753, + "language_loss": 0.75024307, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.77756035, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 2.768934965133667 + }, + { + "auxiliary_loss_clip": 0.0146603, + "auxiliary_loss_mlp": 0.01264799, + "balance_loss_clip": 1.14831901, + "balance_loss_mlp": 1.03248453, + "epoch": 0.40402825792875396, + "flos": 19786028976000.0, + "grad_norm": 2.312493983189071, + "language_loss": 0.81747502, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.84478331, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 2.720590591430664 + }, + { + "auxiliary_loss_clip": 0.01460208, + "auxiliary_loss_mlp": 0.01262268, + "balance_loss_clip": 1.1421299, + "balance_loss_mlp": 1.02861834, + "epoch": 0.40408838118142193, + "flos": 19611292121280.0, + "grad_norm": 2.4886337666143192, + "language_loss": 0.77141595, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79864073, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 2.7957260608673096 + }, + { + "auxiliary_loss_clip": 0.0146414, + "auxiliary_loss_mlp": 0.01261275, + "balance_loss_clip": 1.14810705, + "balance_loss_mlp": 1.033728, + "epoch": 0.4041485044340899, + "flos": 24428368244160.0, + "grad_norm": 2.249281518511722, + "language_loss": 0.72749841, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.75475264, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.850754976272583 + }, + { + "auxiliary_loss_clip": 0.01464214, + "auxiliary_loss_mlp": 0.01260281, + "balance_loss_clip": 1.14653218, + "balance_loss_mlp": 1.03159022, + "epoch": 0.40420862768675786, + "flos": 16761872756160.0, + "grad_norm": 2.100236546040436, + "language_loss": 0.65719378, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.68443871, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 2.784754991531372 + }, + { + "auxiliary_loss_clip": 0.01466427, + "auxiliary_loss_mlp": 0.01268201, + "balance_loss_clip": 1.14902234, + "balance_loss_mlp": 1.03397882, + "epoch": 0.4042687509394258, + "flos": 22493747278080.0, + "grad_norm": 1.6054622963236524, + "language_loss": 0.73615408, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76350045, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 4.242512464523315 + }, + { + "auxiliary_loss_clip": 0.01463084, + "auxiliary_loss_mlp": 0.01264162, + "balance_loss_clip": 1.14606571, + "balance_loss_mlp": 1.03642464, + "epoch": 0.4043288741920938, + "flos": 30338772436800.0, + "grad_norm": 1.7876066293439228, + "language_loss": 0.74658918, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.77386165, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 4.354031085968018 + }, + { + "auxiliary_loss_clip": 0.01464587, + "auxiliary_loss_mlp": 0.01260808, + "balance_loss_clip": 1.14643145, + "balance_loss_mlp": 1.02849269, + "epoch": 0.40438899744476176, + "flos": 46351556746560.0, + "grad_norm": 1.529836000025669, + "language_loss": 0.76771569, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79496956, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 3.0106329917907715 + }, + { + "auxiliary_loss_clip": 0.0146295, + "auxiliary_loss_mlp": 0.01261667, + "balance_loss_clip": 1.1452347, + "balance_loss_mlp": 1.02916098, + "epoch": 0.4044491206974297, + "flos": 13335180966720.0, + "grad_norm": 2.0496426101069063, + "language_loss": 0.82098562, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.84823179, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 2.7312467098236084 + }, + { + "auxiliary_loss_clip": 0.01467552, + "auxiliary_loss_mlp": 0.01262038, + "balance_loss_clip": 1.15038288, + "balance_loss_mlp": 1.02972305, + "epoch": 0.4045092439500977, + "flos": 12095372885760.0, + "grad_norm": 2.164902280813997, + "language_loss": 0.84909165, + "learning_rate": 2.700462388688447e-06, + "loss": 0.8763876, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.7798049449920654 + }, + { + "auxiliary_loss_clip": 0.01468809, + "auxiliary_loss_mlp": 0.01264488, + "balance_loss_clip": 1.15176034, + "balance_loss_mlp": 1.03160095, + "epoch": 0.40456936720276565, + "flos": 21181874964480.0, + "grad_norm": 1.9302661842668554, + "language_loss": 0.81997645, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84730941, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 2.8428876399993896 + }, + { + "auxiliary_loss_clip": 0.01470521, + "auxiliary_loss_mlp": 0.01258061, + "balance_loss_clip": 1.15342796, + "balance_loss_mlp": 1.02536464, + "epoch": 0.4046294904554336, + "flos": 23917584612960.0, + "grad_norm": 4.0292198868126485, + "language_loss": 0.73545837, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.76274419, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 4.409573554992676 + }, + { + "auxiliary_loss_clip": 0.0146132, + "auxiliary_loss_mlp": 0.012542, + "balance_loss_clip": 1.14402676, + "balance_loss_mlp": 1.02035868, + "epoch": 0.4046896137081016, + "flos": 38074297904640.0, + "grad_norm": 1.7645034792626981, + "language_loss": 0.67747843, + "learning_rate": 2.699367885848985e-06, + "loss": 0.70463359, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.953622579574585 + }, + { + "auxiliary_loss_clip": 0.01456567, + "auxiliary_loss_mlp": 0.01262298, + "balance_loss_clip": 1.13848019, + "balance_loss_mlp": 1.03265345, + "epoch": 0.4047497369607696, + "flos": 23619200863680.0, + "grad_norm": 1.714104253007496, + "language_loss": 0.73887718, + "learning_rate": 2.699002998510517e-06, + "loss": 0.7660659, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.831195116043091 + }, + { + "auxiliary_loss_clip": 0.01460199, + "auxiliary_loss_mlp": 0.01256707, + "balance_loss_clip": 1.14280653, + "balance_loss_mlp": 1.02763486, + "epoch": 0.40480986021343757, + "flos": 12825269683200.0, + "grad_norm": 1.9848438156314427, + "language_loss": 0.77382809, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.80099714, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 2.787045478820801 + }, + { + "auxiliary_loss_clip": 0.01461209, + "auxiliary_loss_mlp": 0.01270101, + "balance_loss_clip": 1.1425705, + "balance_loss_mlp": 1.03568804, + "epoch": 0.40486998346610553, + "flos": 23771028745440.0, + "grad_norm": 2.3575961572113773, + "language_loss": 0.7663303, + "learning_rate": 2.698273144328627e-06, + "loss": 0.79364336, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 2.736318826675415 + }, + { + "auxiliary_loss_clip": 0.01465109, + "auxiliary_loss_mlp": 0.01273307, + "balance_loss_clip": 1.14732599, + "balance_loss_mlp": 1.04003811, + "epoch": 0.4049301067187735, + "flos": 22859037030240.0, + "grad_norm": 2.5841248784261115, + "language_loss": 0.65387344, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.68125761, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 2.8235161304473877 + }, + { + "auxiliary_loss_clip": 0.01462597, + "auxiliary_loss_mlp": 0.01253957, + "balance_loss_clip": 1.14541066, + "balance_loss_mlp": 1.02259612, + "epoch": 0.40499022997144146, + "flos": 22786214234400.0, + "grad_norm": 1.770080564617688, + "language_loss": 0.83322525, + "learning_rate": 2.697543184232387e-06, + "loss": 0.86039078, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.79681658744812 + }, + { + "auxiliary_loss_clip": 0.01464158, + "auxiliary_loss_mlp": 0.01261881, + "balance_loss_clip": 1.14506876, + "balance_loss_mlp": 1.02956581, + "epoch": 0.4050503532241094, + "flos": 23041587085920.0, + "grad_norm": 1.77371024613664, + "language_loss": 0.74912089, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77638125, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 2.748940944671631 + }, + { + "auxiliary_loss_clip": 0.0146958, + "auxiliary_loss_mlp": 0.01267457, + "balance_loss_clip": 1.15185344, + "balance_loss_mlp": 1.04029202, + "epoch": 0.4051104764767774, + "flos": 16649262956160.0, + "grad_norm": 2.048843374334527, + "language_loss": 0.71656203, + "learning_rate": 2.696813118332519e-06, + "loss": 0.74393237, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 2.7394065856933594 + }, + { + "auxiliary_loss_clip": 0.01459496, + "auxiliary_loss_mlp": 0.01259789, + "balance_loss_clip": 1.14207292, + "balance_loss_mlp": 1.02938116, + "epoch": 0.40517059972944536, + "flos": 16360626744000.0, + "grad_norm": 1.9045816699317735, + "language_loss": 0.74991369, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77710664, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 2.760861873626709 + }, + { + "auxiliary_loss_clip": 0.01465498, + "auxiliary_loss_mlp": 0.01254459, + "balance_loss_clip": 1.1465801, + "balance_loss_mlp": 1.02424169, + "epoch": 0.4052307229821133, + "flos": 28805890548960.0, + "grad_norm": 1.8828470790277476, + "language_loss": 0.74174309, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76894259, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 2.8087759017944336 + }, + { + "auxiliary_loss_clip": 0.0146714, + "auxiliary_loss_mlp": 0.01265675, + "balance_loss_clip": 1.14987063, + "balance_loss_mlp": 1.03717422, + "epoch": 0.4052908462347813, + "flos": 21400039926720.0, + "grad_norm": 1.6972234333508218, + "language_loss": 0.77296638, + "learning_rate": 2.695717821343153e-06, + "loss": 0.80029452, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 2.7163760662078857 + }, + { + "auxiliary_loss_clip": 0.01457424, + "auxiliary_loss_mlp": 0.01265309, + "balance_loss_clip": 1.1386373, + "balance_loss_mlp": 1.03432894, + "epoch": 0.40535096948744925, + "flos": 22421265835680.0, + "grad_norm": 2.017375742368117, + "language_loss": 0.71085989, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73808724, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.7041754722595215 + }, + { + "auxiliary_loss_clip": 0.01468354, + "auxiliary_loss_mlp": 0.01263668, + "balance_loss_clip": 1.14972305, + "balance_loss_mlp": 1.02906382, + "epoch": 0.4054110927401172, + "flos": 17012201162400.0, + "grad_norm": 2.030742434673872, + "language_loss": 0.72646707, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.75378728, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.703578233718872 + }, + { + "auxiliary_loss_clip": 0.01463177, + "auxiliary_loss_mlp": 0.01263105, + "balance_loss_clip": 1.14397371, + "balance_loss_mlp": 1.02640271, + "epoch": 0.4054712159927852, + "flos": 21616915331520.0, + "grad_norm": 3.2716125433975884, + "language_loss": 0.70428169, + "learning_rate": 2.694622286918588e-06, + "loss": 0.73154449, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 2.73464298248291 + }, + { + "auxiliary_loss_clip": 0.01462979, + "auxiliary_loss_mlp": 0.01258714, + "balance_loss_clip": 1.14485979, + "balance_loss_mlp": 1.02792513, + "epoch": 0.4055313392454532, + "flos": 25814959761600.0, + "grad_norm": 2.11859805474027, + "language_loss": 0.79878688, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82600379, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.7395284175872803 + }, + { + "auxiliary_loss_clip": 0.01460774, + "auxiliary_loss_mlp": 0.01256605, + "balance_loss_clip": 1.14236999, + "balance_loss_mlp": 1.02905846, + "epoch": 0.40559146249812117, + "flos": 14138848764000.0, + "grad_norm": 1.9216032442756967, + "language_loss": 0.67260504, + "learning_rate": 2.693891798911731e-06, + "loss": 0.6997788, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.7745611667633057 + }, + { + "auxiliary_loss_clip": 0.01462751, + "auxiliary_loss_mlp": 0.01251631, + "balance_loss_clip": 1.14484942, + "balance_loss_mlp": 1.02255869, + "epoch": 0.40565158575078913, + "flos": 41358757636800.0, + "grad_norm": 2.01093881357444, + "language_loss": 0.57180053, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59894437, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.9339375495910645 + }, + { + "auxiliary_loss_clip": 0.01463669, + "auxiliary_loss_mlp": 0.01258285, + "balance_loss_clip": 1.1442914, + "balance_loss_mlp": 1.02864027, + "epoch": 0.4057117090034571, + "flos": 28546838665920.0, + "grad_norm": 1.7313601123119382, + "language_loss": 0.84729123, + "learning_rate": 2.693161205655089e-06, + "loss": 0.87451065, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.7807633876800537 + }, + { + "auxiliary_loss_clip": 0.01464692, + "auxiliary_loss_mlp": 0.01251372, + "balance_loss_clip": 1.14565659, + "balance_loss_mlp": 1.02249026, + "epoch": 0.40577183225612506, + "flos": 18006004647360.0, + "grad_norm": 2.0007679290446845, + "language_loss": 0.81811064, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.84527129, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 2.7388834953308105 + }, + { + "auxiliary_loss_clip": 0.01458176, + "auxiliary_loss_mlp": 0.0125073, + "balance_loss_clip": 1.13852119, + "balance_loss_mlp": 1.01860547, + "epoch": 0.40583195550879303, + "flos": 19538772750720.0, + "grad_norm": 2.2985176683999113, + "language_loss": 0.75738746, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.78447652, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 2.800438165664673 + }, + { + "auxiliary_loss_clip": 0.01458459, + "auxiliary_loss_mlp": 0.01262431, + "balance_loss_clip": 1.13927174, + "balance_loss_mlp": 1.02897155, + "epoch": 0.405892078761461, + "flos": 22311652360320.0, + "grad_norm": 2.560015052166887, + "language_loss": 0.74501407, + "learning_rate": 2.692065118669195e-06, + "loss": 0.772223, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 2.778560161590576 + }, + { + "auxiliary_loss_clip": 0.01461995, + "auxiliary_loss_mlp": 0.01256435, + "balance_loss_clip": 1.14383304, + "balance_loss_mlp": 1.02621841, + "epoch": 0.40595220201412896, + "flos": 25486802042400.0, + "grad_norm": 1.747492579097599, + "language_loss": 0.66774094, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.69492519, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 4.415027856826782 + }, + { + "auxiliary_loss_clip": 0.01465153, + "auxiliary_loss_mlp": 0.01257606, + "balance_loss_clip": 1.145895, + "balance_loss_mlp": 1.02281117, + "epoch": 0.4060123252667969, + "flos": 49859718952320.0, + "grad_norm": 1.7031922563667778, + "language_loss": 0.71362555, + "learning_rate": 2.691334262772948e-06, + "loss": 0.74085319, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 3.0627756118774414 + }, + { + "auxiliary_loss_clip": 0.01458717, + "auxiliary_loss_mlp": 0.01260063, + "balance_loss_clip": 1.13854349, + "balance_loss_mlp": 1.02698481, + "epoch": 0.4060724485194649, + "flos": 21137233155840.0, + "grad_norm": 1.9000919481964673, + "language_loss": 0.72160125, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74878901, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 2.7699472904205322 + }, + { + "auxiliary_loss_clip": 0.01456927, + "auxiliary_loss_mlp": 0.01259934, + "balance_loss_clip": 1.13742781, + "balance_loss_mlp": 1.02609336, + "epoch": 0.40613257177213286, + "flos": 21759981808320.0, + "grad_norm": 7.460188251103096, + "language_loss": 0.82966393, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85683256, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 2.7044754028320312 + }, + { + "auxiliary_loss_clip": 0.01461154, + "auxiliary_loss_mlp": 0.01270277, + "balance_loss_clip": 1.14240205, + "balance_loss_mlp": 1.03815222, + "epoch": 0.4061926950248008, + "flos": 25557349148640.0, + "grad_norm": 1.6873136197720668, + "language_loss": 0.70883721, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.73615152, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 2.9009881019592285 + }, + { + "auxiliary_loss_clip": 0.01451457, + "auxiliary_loss_mlp": 0.0126753, + "balance_loss_clip": 1.13050163, + "balance_loss_mlp": 1.03635967, + "epoch": 0.4062528182774688, + "flos": 23698054236960.0, + "grad_norm": 1.9164418925584716, + "language_loss": 0.79008639, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81727624, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 2.7932355403900146 + }, + { + "auxiliary_loss_clip": 0.0145816, + "auxiliary_loss_mlp": 0.01255252, + "balance_loss_clip": 1.1390568, + "balance_loss_mlp": 1.0237, + "epoch": 0.4063129415301368, + "flos": 21728197645920.0, + "grad_norm": 2.2147516164830545, + "language_loss": 0.78402388, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.81115794, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 2.8484084606170654 + }, + { + "auxiliary_loss_clip": 0.01459669, + "auxiliary_loss_mlp": 0.01268864, + "balance_loss_clip": 1.13926065, + "balance_loss_mlp": 1.03597724, + "epoch": 0.40637306478280477, + "flos": 12789275495040.0, + "grad_norm": 1.9941030087923857, + "language_loss": 0.88923883, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.91652417, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 2.7301721572875977 + }, + { + "auxiliary_loss_clip": 0.01454734, + "auxiliary_loss_mlp": 0.01255675, + "balance_loss_clip": 1.13458109, + "balance_loss_mlp": 1.0239327, + "epoch": 0.40643318803547274, + "flos": 24026780878560.0, + "grad_norm": 2.354918191884249, + "language_loss": 0.64408231, + "learning_rate": 2.688775442076598e-06, + "loss": 0.67118639, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 2.777806520462036 + }, + { + "auxiliary_loss_clip": 0.01456938, + "auxiliary_loss_mlp": 0.01264074, + "balance_loss_clip": 1.13731575, + "balance_loss_mlp": 1.03004193, + "epoch": 0.4064933112881407, + "flos": 25594746678720.0, + "grad_norm": 1.5790250968273638, + "language_loss": 0.74931675, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77652687, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.7690842151641846 + }, + { + "auxiliary_loss_clip": 0.0146098, + "auxiliary_loss_mlp": 0.01255806, + "balance_loss_clip": 1.14256835, + "balance_loss_mlp": 1.02959442, + "epoch": 0.40655343454080867, + "flos": 22056507077760.0, + "grad_norm": 1.543595129612893, + "language_loss": 0.7026484, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.7298162, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 4.287019729614258 + }, + { + "auxiliary_loss_clip": 0.01460987, + "auxiliary_loss_mlp": 0.01251311, + "balance_loss_clip": 1.14018536, + "balance_loss_mlp": 1.01918721, + "epoch": 0.40661355779347663, + "flos": 26471313128160.0, + "grad_norm": 1.5258721790934606, + "language_loss": 0.73330671, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.76042968, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 4.346789121627808 + }, + { + "auxiliary_loss_clip": 0.0145935, + "auxiliary_loss_mlp": 0.0126226, + "balance_loss_clip": 1.13904214, + "balance_loss_mlp": 1.02784729, + "epoch": 0.4066736810461446, + "flos": 13262813308800.0, + "grad_norm": 1.9945082181040212, + "language_loss": 0.69377393, + "learning_rate": 2.687312683911033e-06, + "loss": 0.72099006, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.7639319896698 + }, + { + "auxiliary_loss_clip": 0.01459817, + "auxiliary_loss_mlp": 0.01266967, + "balance_loss_clip": 1.14022887, + "balance_loss_mlp": 1.03503382, + "epoch": 0.40673380429881256, + "flos": 28806156046080.0, + "grad_norm": 2.4507938774178615, + "language_loss": 0.91350853, + "learning_rate": 2.686946929177557e-06, + "loss": 0.94077641, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 2.859959840774536 + }, + { + "auxiliary_loss_clip": 0.01464218, + "auxiliary_loss_mlp": 0.0126041, + "balance_loss_clip": 1.14496362, + "balance_loss_mlp": 1.02370834, + "epoch": 0.4067939275514805, + "flos": 12496998179520.0, + "grad_norm": 5.996096177509904, + "language_loss": 0.78476846, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.81201476, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 2.892542839050293 + }, + { + "auxiliary_loss_clip": 0.01459773, + "auxiliary_loss_mlp": 0.01268847, + "balance_loss_clip": 1.13925505, + "balance_loss_mlp": 1.03462493, + "epoch": 0.4068540508041485, + "flos": 18772502483520.0, + "grad_norm": 2.561201076459606, + "language_loss": 0.76812541, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.79541165, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.7491588592529297 + }, + { + "auxiliary_loss_clip": 0.0146108, + "auxiliary_loss_mlp": 0.0126491, + "balance_loss_clip": 1.14205801, + "balance_loss_mlp": 1.03602862, + "epoch": 0.40691417405681646, + "flos": 28515395856960.0, + "grad_norm": 1.8567755954241196, + "language_loss": 0.77449501, + "learning_rate": 2.685849508738034e-06, + "loss": 0.80175495, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 4.1992034912109375 + }, + { + "auxiliary_loss_clip": 0.01460136, + "auxiliary_loss_mlp": 0.01266908, + "balance_loss_clip": 1.14065218, + "balance_loss_mlp": 1.03611898, + "epoch": 0.4069742973094844, + "flos": 20816168002560.0, + "grad_norm": 2.5120693941189507, + "language_loss": 0.87287688, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.90014732, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.7920591831207275 + }, + { + "auxiliary_loss_clip": 0.01464457, + "auxiliary_loss_mlp": 0.01261597, + "balance_loss_clip": 1.14523733, + "balance_loss_mlp": 1.03023589, + "epoch": 0.4070344205621524, + "flos": 21472407584640.0, + "grad_norm": 2.124802308777128, + "language_loss": 0.8067733, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83403385, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 2.734680652618408 + }, + { + "auxiliary_loss_clip": 0.01463942, + "auxiliary_loss_mlp": 0.01257993, + "balance_loss_clip": 1.14428926, + "balance_loss_mlp": 1.02281725, + "epoch": 0.4070945438148204, + "flos": 26832279070080.0, + "grad_norm": 1.8044449371189306, + "language_loss": 0.8052094, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.83242869, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 2.7970211505889893 + }, + { + "auxiliary_loss_clip": 0.01458618, + "auxiliary_loss_mlp": 0.01257685, + "balance_loss_clip": 1.13864088, + "balance_loss_mlp": 1.0278492, + "epoch": 0.4071546670674884, + "flos": 26356275925920.0, + "grad_norm": 1.456141229420781, + "language_loss": 0.76094329, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78810632, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 2.856243133544922 + }, + { + "auxiliary_loss_clip": 0.01457659, + "auxiliary_loss_mlp": 0.01263625, + "balance_loss_clip": 1.13804555, + "balance_loss_mlp": 1.0334084, + "epoch": 0.40721479032015634, + "flos": 17897377304160.0, + "grad_norm": 1.8590891651931993, + "language_loss": 0.8185032, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.84571606, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.7300448417663574 + }, + { + "auxiliary_loss_clip": 0.01565479, + "auxiliary_loss_mlp": 0.01212372, + "balance_loss_clip": 1.2684412, + "balance_loss_mlp": 1.00256348, + "epoch": 0.4072749135728243, + "flos": 49860249582240.0, + "grad_norm": 0.8294714198586917, + "language_loss": 0.64247966, + "learning_rate": 2.683653966031597e-06, + "loss": 0.67025816, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 3.190504312515259 + }, + { + "auxiliary_loss_clip": 0.01463043, + "auxiliary_loss_mlp": 0.01263799, + "balance_loss_clip": 1.14247465, + "balance_loss_mlp": 1.03033948, + "epoch": 0.40733503682549227, + "flos": 27566120396160.0, + "grad_norm": 1.7899227700728226, + "language_loss": 0.72095072, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74821913, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 2.7641170024871826 + }, + { + "auxiliary_loss_clip": 0.01460012, + "auxiliary_loss_mlp": 0.01260523, + "balance_loss_clip": 1.13961506, + "balance_loss_mlp": 1.02572858, + "epoch": 0.40739516007816023, + "flos": 22129140232800.0, + "grad_norm": 1.4383272846504789, + "language_loss": 0.78020209, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80740744, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 2.785916805267334 + }, + { + "auxiliary_loss_clip": 0.01468744, + "auxiliary_loss_mlp": 0.01271041, + "balance_loss_clip": 1.14927077, + "balance_loss_mlp": 1.03891683, + "epoch": 0.4074552833308282, + "flos": 23844534248160.0, + "grad_norm": 4.937152696020711, + "language_loss": 0.79511297, + "learning_rate": 2.682555844513981e-06, + "loss": 0.82251078, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 2.7855746746063232 + }, + { + "auxiliary_loss_clip": 0.01560323, + "auxiliary_loss_mlp": 0.01241661, + "balance_loss_clip": 1.26316154, + "balance_loss_mlp": 1.0333786, + "epoch": 0.40751540658349616, + "flos": 58006654814880.0, + "grad_norm": 0.6850630205909874, + "language_loss": 0.53040403, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55842382, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.3077199459075928 + }, + { + "auxiliary_loss_clip": 0.0146749, + "auxiliary_loss_mlp": 0.01279069, + "balance_loss_clip": 1.14778376, + "balance_loss_mlp": 1.04599154, + "epoch": 0.40757552983616413, + "flos": 21216769236000.0, + "grad_norm": 7.232029349276274, + "language_loss": 0.81621516, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84368074, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.8325819969177246 + }, + { + "auxiliary_loss_clip": 0.01464508, + "auxiliary_loss_mlp": 0.01258722, + "balance_loss_clip": 1.14566588, + "balance_loss_mlp": 1.02602577, + "epoch": 0.4076356530888321, + "flos": 26835958101600.0, + "grad_norm": 3.4275748184194783, + "language_loss": 0.76567101, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.79290336, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.78007435798645 + }, + { + "auxiliary_loss_clip": 0.01460558, + "auxiliary_loss_mlp": 0.01270725, + "balance_loss_clip": 1.14233065, + "balance_loss_mlp": 1.04127121, + "epoch": 0.40769577634150006, + "flos": 12204645007680.0, + "grad_norm": 1.9323854524447375, + "language_loss": 0.66501766, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.69233048, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.8826193809509277 + }, + { + "auxiliary_loss_clip": 0.01459783, + "auxiliary_loss_mlp": 0.01269088, + "balance_loss_clip": 1.14014375, + "balance_loss_mlp": 1.04077876, + "epoch": 0.407755899594168, + "flos": 33658012656000.0, + "grad_norm": 1.6097229189107354, + "language_loss": 0.71613288, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.74342155, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.8566274642944336 + }, + { + "auxiliary_loss_clip": 0.01456977, + "auxiliary_loss_mlp": 0.01269399, + "balance_loss_clip": 1.13677692, + "balance_loss_mlp": 1.03918147, + "epoch": 0.407816022846836, + "flos": 20159435354400.0, + "grad_norm": 2.093539416045998, + "language_loss": 0.82514799, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.85241169, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 2.7370235919952393 + }, + { + "auxiliary_loss_clip": 0.01462887, + "auxiliary_loss_mlp": 0.01273132, + "balance_loss_clip": 1.14349079, + "balance_loss_mlp": 1.0408169, + "epoch": 0.40787614609950396, + "flos": 21180964688640.0, + "grad_norm": 1.8450547919742486, + "language_loss": 0.81111014, + "learning_rate": 2.679992655730283e-06, + "loss": 0.83847034, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 2.7348811626434326 + }, + { + "auxiliary_loss_clip": 0.01459623, + "auxiliary_loss_mlp": 0.01271165, + "balance_loss_clip": 1.139153, + "balance_loss_mlp": 1.03579831, + "epoch": 0.407936269352172, + "flos": 20522676985920.0, + "grad_norm": 1.874010710046497, + "language_loss": 0.65987659, + "learning_rate": 2.679626382651386e-06, + "loss": 0.68718445, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.7253899574279785 + }, + { + "auxiliary_loss_clip": 0.01459183, + "auxiliary_loss_mlp": 0.01258988, + "balance_loss_clip": 1.13870919, + "balance_loss_mlp": 1.02934313, + "epoch": 0.40799639260483994, + "flos": 20120786195040.0, + "grad_norm": 2.023673800901967, + "language_loss": 0.80015302, + "learning_rate": 2.679260083800989e-06, + "loss": 0.8273347, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.7492735385894775 + }, + { + "auxiliary_loss_clip": 0.0146117, + "auxiliary_loss_mlp": 0.01266346, + "balance_loss_clip": 1.13987923, + "balance_loss_mlp": 1.0347935, + "epoch": 0.4080565158575079, + "flos": 20999514549600.0, + "grad_norm": 1.880495766522314, + "language_loss": 0.81866646, + "learning_rate": 2.678893759192982e-06, + "loss": 0.84594154, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 2.730684518814087 + }, + { + "auxiliary_loss_clip": 0.0146548, + "auxiliary_loss_mlp": 0.01269395, + "balance_loss_clip": 1.14565361, + "balance_loss_mlp": 1.03574491, + "epoch": 0.40811663911017587, + "flos": 19319735440800.0, + "grad_norm": 10.441349707391865, + "language_loss": 0.68054551, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70789427, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 2.7057831287384033 + }, + { + "auxiliary_loss_clip": 0.01458215, + "auxiliary_loss_mlp": 0.01265366, + "balance_loss_clip": 1.13773751, + "balance_loss_mlp": 1.03534019, + "epoch": 0.40817676236284384, + "flos": 40628595342240.0, + "grad_norm": 1.8866860122436822, + "language_loss": 0.66307294, + "learning_rate": 2.678161032759701e-06, + "loss": 0.69030875, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.8893027305603027 + }, + { + "auxiliary_loss_clip": 0.01456154, + "auxiliary_loss_mlp": 0.0125783, + "balance_loss_clip": 1.13543725, + "balance_loss_mlp": 1.02589655, + "epoch": 0.4082368856155118, + "flos": 20524194112320.0, + "grad_norm": 1.711378380981162, + "language_loss": 0.60446703, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.63160694, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 4.373453855514526 + }, + { + "auxiliary_loss_clip": 0.01469708, + "auxiliary_loss_mlp": 0.01266426, + "balance_loss_clip": 1.14933217, + "balance_loss_mlp": 1.03449297, + "epoch": 0.40829700886817977, + "flos": 11428816844160.0, + "grad_norm": 4.422642317288595, + "language_loss": 0.69848359, + "learning_rate": 2.677428203462683e-06, + "loss": 0.72584498, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.675788640975952 + }, + { + "auxiliary_loss_clip": 0.01558802, + "auxiliary_loss_mlp": 0.01235039, + "balance_loss_clip": 1.25905871, + "balance_loss_mlp": 1.02523041, + "epoch": 0.40835713212084773, + "flos": 67337303009760.0, + "grad_norm": 0.7480267976903263, + "language_loss": 0.59543437, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.62337279, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 3.296828508377075 + }, + { + "auxiliary_loss_clip": 0.01473125, + "auxiliary_loss_mlp": 0.01275793, + "balance_loss_clip": 1.15019536, + "balance_loss_mlp": 1.0430969, + "epoch": 0.4084172553735157, + "flos": 21764153905920.0, + "grad_norm": 1.6163405874214454, + "language_loss": 0.80072361, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.8282128, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 2.749302387237549 + }, + { + "auxiliary_loss_clip": 0.01459892, + "auxiliary_loss_mlp": 0.01268037, + "balance_loss_clip": 1.13887584, + "balance_loss_mlp": 1.03667533, + "epoch": 0.40847737862618366, + "flos": 27419716241280.0, + "grad_norm": 1.8005971036266222, + "language_loss": 0.84776914, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87504846, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 2.7873916625976562 + }, + { + "auxiliary_loss_clip": 0.01464978, + "auxiliary_loss_mlp": 0.01264694, + "balance_loss_clip": 1.14435029, + "balance_loss_mlp": 1.03371429, + "epoch": 0.4085375018788516, + "flos": 18589193864640.0, + "grad_norm": 1.730711643252243, + "language_loss": 0.79753333, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82483006, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.8103110790252686 + }, + { + "auxiliary_loss_clip": 0.0146197, + "auxiliary_loss_mlp": 0.01271075, + "balance_loss_clip": 1.14074612, + "balance_loss_mlp": 1.03570867, + "epoch": 0.4085976251315196, + "flos": 15413475260160.0, + "grad_norm": 2.431007180969561, + "language_loss": 0.6986506, + "learning_rate": 2.675595680920792e-06, + "loss": 0.72598106, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 2.7261204719543457 + }, + { + "auxiliary_loss_clip": 0.01456021, + "auxiliary_loss_mlp": 0.01261934, + "balance_loss_clip": 1.13553262, + "balance_loss_mlp": 1.03076291, + "epoch": 0.40865774838418756, + "flos": 21254394335040.0, + "grad_norm": 1.8537875278263776, + "language_loss": 0.78127676, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.8084563, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 2.7441439628601074 + }, + { + "auxiliary_loss_clip": 0.01456263, + "auxiliary_loss_mlp": 0.01254885, + "balance_loss_clip": 1.13447726, + "balance_loss_mlp": 1.02333343, + "epoch": 0.4087178716368556, + "flos": 13773976221600.0, + "grad_norm": 1.968145072294605, + "language_loss": 0.85514349, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.88225502, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 2.8323168754577637 + }, + { + "auxiliary_loss_clip": 0.01459625, + "auxiliary_loss_mlp": 0.01258509, + "balance_loss_clip": 1.13880563, + "balance_loss_mlp": 1.02829242, + "epoch": 0.40877799488952354, + "flos": 23623941883680.0, + "grad_norm": 2.291185879581228, + "language_loss": 0.83954382, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86672521, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 2.8594906330108643 + }, + { + "auxiliary_loss_clip": 0.0146544, + "auxiliary_loss_mlp": 0.01267396, + "balance_loss_clip": 1.14495564, + "balance_loss_mlp": 1.03622508, + "epoch": 0.4088381181421915, + "flos": 20920661176320.0, + "grad_norm": 2.444719475732173, + "language_loss": 0.83988959, + "learning_rate": 2.6741292016681e-06, + "loss": 0.8672179, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 4.170229196548462 + }, + { + "auxiliary_loss_clip": 0.01461341, + "auxiliary_loss_mlp": 0.01263555, + "balance_loss_clip": 1.13959026, + "balance_loss_mlp": 1.03333783, + "epoch": 0.4088982413948595, + "flos": 13299262634880.0, + "grad_norm": 6.218569605000431, + "language_loss": 0.74779969, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.77504867, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 2.791795015335083 + }, + { + "auxiliary_loss_clip": 0.01453236, + "auxiliary_loss_mlp": 0.01275259, + "balance_loss_clip": 1.13242328, + "balance_loss_mlp": 1.04542351, + "epoch": 0.40895836464752744, + "flos": 15269460579360.0, + "grad_norm": 2.257061021456979, + "language_loss": 0.80286968, + "learning_rate": 2.673395808607861e-06, + "loss": 0.83015466, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 4.195723295211792 + }, + { + "auxiliary_loss_clip": 0.0145827, + "auxiliary_loss_mlp": 0.01270572, + "balance_loss_clip": 1.13770521, + "balance_loss_mlp": 1.03806639, + "epoch": 0.4090184879001954, + "flos": 14503152384000.0, + "grad_norm": 2.482546890614211, + "language_loss": 0.76492012, + "learning_rate": 2.673029073767934e-06, + "loss": 0.79220855, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.7330315113067627 + }, + { + "auxiliary_loss_clip": 0.01452778, + "auxiliary_loss_mlp": 0.01262682, + "balance_loss_clip": 1.13258028, + "balance_loss_mlp": 1.03360939, + "epoch": 0.40907861115286337, + "flos": 13883324199840.0, + "grad_norm": 1.9049658936887839, + "language_loss": 0.786937, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.81409162, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 2.7272021770477295 + }, + { + "auxiliary_loss_clip": 0.01453383, + "auxiliary_loss_mlp": 0.01259119, + "balance_loss_clip": 1.13185012, + "balance_loss_mlp": 1.02527809, + "epoch": 0.40913873440553133, + "flos": 28040113347840.0, + "grad_norm": 2.6704132371405205, + "language_loss": 0.75731903, + "learning_rate": 2.672295527537998e-06, + "loss": 0.78444409, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.7613160610198975 + }, + { + "auxiliary_loss_clip": 0.01465982, + "auxiliary_loss_mlp": 0.01266425, + "balance_loss_clip": 1.14499962, + "balance_loss_mlp": 1.0329659, + "epoch": 0.4091988576581993, + "flos": 21620442650400.0, + "grad_norm": 1.8260960353507327, + "language_loss": 0.79145283, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81877685, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 4.2958245277404785 + }, + { + "auxiliary_loss_clip": 0.0145505, + "auxiliary_loss_mlp": 0.01260241, + "balance_loss_clip": 1.13456893, + "balance_loss_mlp": 1.02678156, + "epoch": 0.40925898091086726, + "flos": 25226195104800.0, + "grad_norm": 2.034209111511459, + "language_loss": 0.73141724, + "learning_rate": 2.671561879334007e-06, + "loss": 0.75857019, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.8051347732543945 + }, + { + "auxiliary_loss_clip": 0.01604945, + "auxiliary_loss_mlp": 0.01219582, + "balance_loss_clip": 1.30637109, + "balance_loss_mlp": 1.00901031, + "epoch": 0.40931910416353523, + "flos": 68936977116000.0, + "grad_norm": 0.8444747666002226, + "language_loss": 0.58752501, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.61577034, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 3.4047086238861084 + }, + { + "auxiliary_loss_clip": 0.01464017, + "auxiliary_loss_mlp": 0.01256963, + "balance_loss_clip": 1.144943, + "balance_loss_mlp": 1.02636445, + "epoch": 0.4093792274162032, + "flos": 20191409157600.0, + "grad_norm": 1.7294112056718054, + "language_loss": 0.54629076, + "learning_rate": 2.670828129267242e-06, + "loss": 0.57350063, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 2.7324352264404297 + }, + { + "auxiliary_loss_clip": 0.01459619, + "auxiliary_loss_mlp": 0.01255282, + "balance_loss_clip": 1.14009428, + "balance_loss_mlp": 1.02392077, + "epoch": 0.40943935066887116, + "flos": 25231125765600.0, + "grad_norm": 1.7981912057190657, + "language_loss": 0.83160549, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85875452, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 2.748319625854492 + }, + { + "auxiliary_loss_clip": 0.01478587, + "auxiliary_loss_mlp": 0.01266588, + "balance_loss_clip": 1.15735543, + "balance_loss_mlp": 1.03141177, + "epoch": 0.4094994739215392, + "flos": 23257400502240.0, + "grad_norm": 2.9073615778838144, + "language_loss": 0.77260339, + "learning_rate": 2.670094277448999e-06, + "loss": 0.80005515, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.8297436237335205 + }, + { + "auxiliary_loss_clip": 0.01462277, + "auxiliary_loss_mlp": 0.0126005, + "balance_loss_clip": 1.14153647, + "balance_loss_mlp": 1.0281167, + "epoch": 0.40955959717420715, + "flos": 17383976629920.0, + "grad_norm": 1.6622109312057693, + "language_loss": 0.70638967, + "learning_rate": 2.669727313417857e-06, + "loss": 0.7336129, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.6887056827545166 + }, + { + "auxiliary_loss_clip": 0.01460029, + "auxiliary_loss_mlp": 0.01255805, + "balance_loss_clip": 1.13905525, + "balance_loss_mlp": 1.02291799, + "epoch": 0.4096197204268751, + "flos": 25084759538880.0, + "grad_norm": 1.8533406076801286, + "language_loss": 0.66242909, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68958747, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 2.750431537628174 + }, + { + "auxiliary_loss_clip": 0.01463792, + "auxiliary_loss_mlp": 0.01260655, + "balance_loss_clip": 1.14346325, + "balance_loss_mlp": 1.02967525, + "epoch": 0.4096798436795431, + "flos": 30589138771200.0, + "grad_norm": 2.0020254596809677, + "language_loss": 0.74428183, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.77152634, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.7891297340393066 + }, + { + "auxiliary_loss_clip": 0.01469407, + "auxiliary_loss_mlp": 0.0126736, + "balance_loss_clip": 1.14835119, + "balance_loss_mlp": 1.03332865, + "epoch": 0.40973996693221104, + "flos": 24135901287840.0, + "grad_norm": 2.1116762398023514, + "language_loss": 0.66165411, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68902171, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 2.774394989013672 + }, + { + "auxiliary_loss_clip": 0.01467417, + "auxiliary_loss_mlp": 0.01267728, + "balance_loss_clip": 1.14746475, + "balance_loss_mlp": 1.0401814, + "epoch": 0.409800090184879, + "flos": 23991317684640.0, + "grad_norm": 1.6134849235174467, + "language_loss": 0.76227093, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78962243, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 2.8076705932617188 + }, + { + "auxiliary_loss_clip": 0.01469848, + "auxiliary_loss_mlp": 0.01262985, + "balance_loss_clip": 1.14946985, + "balance_loss_mlp": 1.03086102, + "epoch": 0.40986021343754697, + "flos": 16145533962720.0, + "grad_norm": 2.1731233672760073, + "language_loss": 0.81954795, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.84687626, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.724536418914795 + }, + { + "auxiliary_loss_clip": 0.01463033, + "auxiliary_loss_mlp": 0.01266913, + "balance_loss_clip": 1.14226472, + "balance_loss_mlp": 1.03173637, + "epoch": 0.40992033669021494, + "flos": 24793164930240.0, + "grad_norm": 1.6161143743724933, + "language_loss": 0.79741228, + "learning_rate": 2.667524996399444e-06, + "loss": 0.8247118, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.758498191833496 + }, + { + "auxiliary_loss_clip": 0.01461793, + "auxiliary_loss_mlp": 0.01254988, + "balance_loss_clip": 1.14100397, + "balance_loss_mlp": 1.02591598, + "epoch": 0.4099804599428829, + "flos": 29644414689600.0, + "grad_norm": 1.6845367231948962, + "language_loss": 0.65949869, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68666649, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.748908281326294 + }, + { + "auxiliary_loss_clip": 0.01465095, + "auxiliary_loss_mlp": 0.01266687, + "balance_loss_clip": 1.14276433, + "balance_loss_mlp": 1.03341877, + "epoch": 0.41004058319555087, + "flos": 24828893621280.0, + "grad_norm": 1.5753502320942054, + "language_loss": 0.85304964, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.88036746, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.7793350219726562 + }, + { + "auxiliary_loss_clip": 0.01457429, + "auxiliary_loss_mlp": 0.01260196, + "balance_loss_clip": 1.13587761, + "balance_loss_mlp": 1.03188634, + "epoch": 0.41010070644821883, + "flos": 25739747491680.0, + "grad_norm": 2.4040612772923113, + "language_loss": 0.71348369, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.74065995, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.7413394451141357 + }, + { + "auxiliary_loss_clip": 0.01461369, + "auxiliary_loss_mlp": 0.01255006, + "balance_loss_clip": 1.13939095, + "balance_loss_mlp": 1.02555156, + "epoch": 0.4101608297008868, + "flos": 22348253399040.0, + "grad_norm": 1.8270332229932265, + "language_loss": 0.74860895, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.77577269, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.7521965503692627 + }, + { + "auxiliary_loss_clip": 0.01463358, + "auxiliary_loss_mlp": 0.0126504, + "balance_loss_clip": 1.14171314, + "balance_loss_mlp": 1.03558612, + "epoch": 0.41022095295355476, + "flos": 21947272884000.0, + "grad_norm": 1.8989455071774954, + "language_loss": 0.75786781, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.78515184, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 2.707062005996704 + }, + { + "auxiliary_loss_clip": 0.01465891, + "auxiliary_loss_mlp": 0.01261491, + "balance_loss_clip": 1.14368773, + "balance_loss_mlp": 1.02726817, + "epoch": 0.4102810762062228, + "flos": 27452789961120.0, + "grad_norm": 1.7385266173109017, + "language_loss": 0.72838056, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75565434, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 2.7501649856567383 + }, + { + "auxiliary_loss_clip": 0.01461349, + "auxiliary_loss_mlp": 0.0127031, + "balance_loss_clip": 1.13937068, + "balance_loss_mlp": 1.03475237, + "epoch": 0.41034119945889075, + "flos": 24501608249760.0, + "grad_norm": 1.78574918690047, + "language_loss": 0.71926475, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.74658138, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.7707204818725586 + }, + { + "auxiliary_loss_clip": 0.01457553, + "auxiliary_loss_mlp": 0.0126303, + "balance_loss_clip": 1.13454127, + "balance_loss_mlp": 1.03433883, + "epoch": 0.4104013227115587, + "flos": 24354938597760.0, + "grad_norm": 1.9545523759876686, + "language_loss": 0.8481378, + "learning_rate": 2.664587156721768e-06, + "loss": 0.87534368, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.727283000946045 + }, + { + "auxiliary_loss_clip": 0.01463978, + "auxiliary_loss_mlp": 0.01264681, + "balance_loss_clip": 1.14234054, + "balance_loss_mlp": 1.03370094, + "epoch": 0.4104614459642267, + "flos": 23731431382080.0, + "grad_norm": 2.3327695691903703, + "language_loss": 0.66587657, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.69316316, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.7731597423553467 + }, + { + "auxiliary_loss_clip": 0.01452674, + "auxiliary_loss_mlp": 0.01256816, + "balance_loss_clip": 1.1307621, + "balance_loss_mlp": 1.02621722, + "epoch": 0.41052156921689464, + "flos": 22130202221280.0, + "grad_norm": 1.4599513789451504, + "language_loss": 0.72353113, + "learning_rate": 2.663852444511689e-06, + "loss": 0.75062609, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 4.356207370758057 + }, + { + "auxiliary_loss_clip": 0.01459134, + "auxiliary_loss_mlp": 0.01273215, + "balance_loss_clip": 1.1360085, + "balance_loss_mlp": 1.04013681, + "epoch": 0.4105816924695626, + "flos": 20086460845920.0, + "grad_norm": 1.9501085920420151, + "language_loss": 0.83363521, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.8609587, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.7252402305603027 + }, + { + "auxiliary_loss_clip": 0.01450466, + "auxiliary_loss_mlp": 0.01260281, + "balance_loss_clip": 1.12727141, + "balance_loss_mlp": 1.03120804, + "epoch": 0.4106418157222306, + "flos": 18078713658720.0, + "grad_norm": 1.8207071260593681, + "language_loss": 0.89727902, + "learning_rate": 2.663117631608206e-06, + "loss": 0.9243865, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.8119804859161377 + }, + { + "auxiliary_loss_clip": 0.01457579, + "auxiliary_loss_mlp": 0.01266891, + "balance_loss_clip": 1.13466978, + "balance_loss_mlp": 1.03743708, + "epoch": 0.41070193897489854, + "flos": 21649723554240.0, + "grad_norm": 1.7992501183606833, + "language_loss": 0.65306616, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68031085, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 2.7872400283813477 + }, + { + "auxiliary_loss_clip": 0.01452505, + "auxiliary_loss_mlp": 0.01256732, + "balance_loss_clip": 1.12838173, + "balance_loss_mlp": 1.02746868, + "epoch": 0.4107620622275665, + "flos": 26650449649440.0, + "grad_norm": 2.9285167257257445, + "language_loss": 0.69710159, + "learning_rate": 2.662382718122776e-06, + "loss": 0.72419399, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 2.924637794494629 + }, + { + "auxiliary_loss_clip": 0.0144868, + "auxiliary_loss_mlp": 0.01259937, + "balance_loss_clip": 1.12537551, + "balance_loss_mlp": 1.02876592, + "epoch": 0.41082218548023447, + "flos": 18736318654560.0, + "grad_norm": 2.2268606139848712, + "language_loss": 0.73633885, + "learning_rate": 2.662015223696666e-06, + "loss": 0.76342505, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 2.7672252655029297 + }, + { + "auxiliary_loss_clip": 0.01454706, + "auxiliary_loss_mlp": 0.01268625, + "balance_loss_clip": 1.13083804, + "balance_loss_mlp": 1.0330677, + "epoch": 0.41088230873290243, + "flos": 22896017350560.0, + "grad_norm": 1.6320607828180278, + "language_loss": 0.72454464, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.75177801, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.7702713012695312 + }, + { + "auxiliary_loss_clip": 0.01446657, + "auxiliary_loss_mlp": 0.01264165, + "balance_loss_clip": 1.12294579, + "balance_loss_mlp": 1.03204083, + "epoch": 0.4109424319855704, + "flos": 24279119477280.0, + "grad_norm": 2.026141466282425, + "language_loss": 0.71969044, + "learning_rate": 2.661280159547329e-06, + "loss": 0.74679863, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 2.7452282905578613 + }, + { + "auxiliary_loss_clip": 0.01457932, + "auxiliary_loss_mlp": 0.01267734, + "balance_loss_clip": 1.13426352, + "balance_loss_mlp": 1.03217661, + "epoch": 0.41100255523823837, + "flos": 12970801490400.0, + "grad_norm": 1.9217898075046858, + "language_loss": 0.86874163, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89599836, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 2.6723814010620117 + }, + { + "auxiliary_loss_clip": 0.0145394, + "auxiliary_loss_mlp": 0.01257442, + "balance_loss_clip": 1.13244855, + "balance_loss_mlp": 1.02531815, + "epoch": 0.4110626784909064, + "flos": 23147711170560.0, + "grad_norm": 1.8031515140661394, + "language_loss": 0.69140673, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71852058, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 2.7377679347991943 + }, + { + "auxiliary_loss_clip": 0.01453913, + "auxiliary_loss_mlp": 0.01254422, + "balance_loss_clip": 1.1309545, + "balance_loss_mlp": 1.02058077, + "epoch": 0.41112280174357435, + "flos": 22749613195680.0, + "grad_norm": 1.727859635084048, + "language_loss": 0.75173026, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77881366, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 4.269673109054565 + }, + { + "auxiliary_loss_clip": 0.01456184, + "auxiliary_loss_mlp": 0.01254233, + "balance_loss_clip": 1.13403153, + "balance_loss_mlp": 1.02153659, + "epoch": 0.4111829249962423, + "flos": 21104349076800.0, + "grad_norm": 2.1511043296726484, + "language_loss": 0.82376748, + "learning_rate": 2.659809730450451e-06, + "loss": 0.85087168, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.7272276878356934 + }, + { + "auxiliary_loss_clip": 0.01446597, + "auxiliary_loss_mlp": 0.01254702, + "balance_loss_clip": 1.12459326, + "balance_loss_mlp": 1.02429461, + "epoch": 0.4112430482489103, + "flos": 21507870778560.0, + "grad_norm": 1.9192171167204046, + "language_loss": 0.8018899, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82890284, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 4.198974609375 + }, + { + "auxiliary_loss_clip": 0.01455455, + "auxiliary_loss_mlp": 0.01255579, + "balance_loss_clip": 1.13254786, + "balance_loss_mlp": 1.02593386, + "epoch": 0.41130317150157825, + "flos": 19571884398720.0, + "grad_norm": 1.868325886240937, + "language_loss": 0.67783952, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.70494986, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 2.707829475402832 + }, + { + "auxiliary_loss_clip": 0.01582092, + "auxiliary_loss_mlp": 0.01243568, + "balance_loss_clip": 1.28156126, + "balance_loss_mlp": 1.03528595, + "epoch": 0.4113632947542462, + "flos": 62390321481600.0, + "grad_norm": 0.770818745993829, + "language_loss": 0.59617269, + "learning_rate": 2.65870664586847e-06, + "loss": 0.62442929, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.3662965297698975 + }, + { + "auxiliary_loss_clip": 0.01451614, + "auxiliary_loss_mlp": 0.01268335, + "balance_loss_clip": 1.13034761, + "balance_loss_mlp": 1.03773654, + "epoch": 0.4114234180069142, + "flos": 13920380376480.0, + "grad_norm": 2.0996794529288065, + "language_loss": 0.70213938, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72933888, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 2.699680805206299 + }, + { + "auxiliary_loss_clip": 0.01575652, + "auxiliary_loss_mlp": 0.0123098, + "balance_loss_clip": 1.27572131, + "balance_loss_mlp": 1.02346039, + "epoch": 0.41148354125958214, + "flos": 64935326520000.0, + "grad_norm": 0.7381929827465341, + "language_loss": 0.53659564, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.56466198, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 4.739994764328003 + }, + { + "auxiliary_loss_clip": 0.01442131, + "auxiliary_loss_mlp": 0.01267316, + "balance_loss_clip": 1.11926281, + "balance_loss_mlp": 1.03748071, + "epoch": 0.4115436645122501, + "flos": 18730477717920.0, + "grad_norm": 1.8222516886739966, + "language_loss": 0.66293353, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.69002801, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 2.7315080165863037 + }, + { + "auxiliary_loss_clip": 0.01455605, + "auxiliary_loss_mlp": 0.01269178, + "balance_loss_clip": 1.13353109, + "balance_loss_mlp": 1.03857958, + "epoch": 0.41160378776491807, + "flos": 16254768156480.0, + "grad_norm": 2.6046077581176887, + "language_loss": 0.7019887, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72923648, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 2.7570667266845703 + }, + { + "auxiliary_loss_clip": 0.01445369, + "auxiliary_loss_mlp": 0.0127183, + "balance_loss_clip": 1.12328529, + "balance_loss_mlp": 1.03913307, + "epoch": 0.41166391101758604, + "flos": 27972828063360.0, + "grad_norm": 2.6421600592363497, + "language_loss": 0.64826715, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67543912, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 2.816870927810669 + }, + { + "auxiliary_loss_clip": 0.01455916, + "auxiliary_loss_mlp": 0.01269207, + "balance_loss_clip": 1.13287508, + "balance_loss_mlp": 1.03956187, + "epoch": 0.411724034270254, + "flos": 34134167512800.0, + "grad_norm": 1.43117965753625, + "language_loss": 0.7057125, + "learning_rate": 2.656499802669069e-06, + "loss": 0.73296374, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 2.8496437072753906 + }, + { + "auxiliary_loss_clip": 0.01561311, + "auxiliary_loss_mlp": 0.01245895, + "balance_loss_clip": 1.26170588, + "balance_loss_mlp": 1.03684998, + "epoch": 0.41178415752292197, + "flos": 67930846614720.0, + "grad_norm": 0.8950649808022776, + "language_loss": 0.56240785, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.59047997, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.403177499771118 + }, + { + "auxiliary_loss_clip": 0.01450581, + "auxiliary_loss_mlp": 0.01257511, + "balance_loss_clip": 1.12814331, + "balance_loss_mlp": 1.02538681, + "epoch": 0.41184428077558993, + "flos": 34316831352960.0, + "grad_norm": 1.611048664782556, + "language_loss": 0.75799739, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78507829, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 3.0151286125183105 + }, + { + "auxiliary_loss_clip": 0.01442492, + "auxiliary_loss_mlp": 0.01258729, + "balance_loss_clip": 1.12031496, + "balance_loss_mlp": 1.02851176, + "epoch": 0.41190440402825795, + "flos": 35447025958560.0, + "grad_norm": 1.5037184412362126, + "language_loss": 0.68094587, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.7079581, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 2.96817946434021 + }, + { + "auxiliary_loss_clip": 0.01444481, + "auxiliary_loss_mlp": 0.01275848, + "balance_loss_clip": 1.12120688, + "balance_loss_mlp": 1.04563081, + "epoch": 0.4119645272809259, + "flos": 20851972549920.0, + "grad_norm": 2.1857215548387003, + "language_loss": 0.79724675, + "learning_rate": 2.655028075792743e-06, + "loss": 0.82445008, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 2.7781450748443604 + }, + { + "auxiliary_loss_clip": 0.01456253, + "auxiliary_loss_mlp": 0.01277417, + "balance_loss_clip": 1.13298905, + "balance_loss_mlp": 1.0456742, + "epoch": 0.4120246505335939, + "flos": 27564451557120.0, + "grad_norm": 2.1476215780413477, + "language_loss": 0.77959996, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.80693668, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 2.8076913356781006 + }, + { + "auxiliary_loss_clip": 0.01455039, + "auxiliary_loss_mlp": 0.01256644, + "balance_loss_clip": 1.13378406, + "balance_loss_mlp": 1.02184939, + "epoch": 0.41208477378626185, + "flos": 37818469915200.0, + "grad_norm": 1.9160034775548038, + "language_loss": 0.65727746, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.6843943, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.957824468612671 + }, + { + "auxiliary_loss_clip": 0.01446654, + "auxiliary_loss_mlp": 0.01261641, + "balance_loss_clip": 1.12560272, + "balance_loss_mlp": 1.02989817, + "epoch": 0.4121448970389298, + "flos": 23443022738880.0, + "grad_norm": 2.5057144830446396, + "language_loss": 0.83811402, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.86519706, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.833436965942383 + }, + { + "auxiliary_loss_clip": 0.01450587, + "auxiliary_loss_mlp": 0.01274552, + "balance_loss_clip": 1.12906218, + "balance_loss_mlp": 1.04395366, + "epoch": 0.4122050202915978, + "flos": 21327482628000.0, + "grad_norm": 1.6689122391594313, + "language_loss": 0.79075152, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81800282, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.7201366424560547 + }, + { + "auxiliary_loss_clip": 0.0145413, + "auxiliary_loss_mlp": 0.01266825, + "balance_loss_clip": 1.13197136, + "balance_loss_mlp": 1.03489161, + "epoch": 0.41226514354426574, + "flos": 17307929940480.0, + "grad_norm": 2.4531834106471457, + "language_loss": 0.80249012, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.82969964, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.831162691116333 + }, + { + "auxiliary_loss_clip": 0.01438834, + "auxiliary_loss_mlp": 0.01262442, + "balance_loss_clip": 1.11636329, + "balance_loss_mlp": 1.03031731, + "epoch": 0.4123252667969337, + "flos": 17640752823360.0, + "grad_norm": 6.175556991812485, + "language_loss": 0.70436591, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.73137867, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.8151907920837402 + }, + { + "auxiliary_loss_clip": 0.01451099, + "auxiliary_loss_mlp": 0.01263405, + "balance_loss_clip": 1.12917948, + "balance_loss_mlp": 1.03337908, + "epoch": 0.4123853900496017, + "flos": 46428248214720.0, + "grad_norm": 1.5497190551686988, + "language_loss": 0.59423637, + "learning_rate": 2.652451598005391e-06, + "loss": 0.6213814, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.978363275527954 + }, + { + "auxiliary_loss_clip": 0.01441318, + "auxiliary_loss_mlp": 0.01264135, + "balance_loss_clip": 1.11962676, + "balance_loss_mlp": 1.03525317, + "epoch": 0.41244551330226964, + "flos": 17677050436800.0, + "grad_norm": 2.225815117428063, + "language_loss": 0.73564684, + "learning_rate": 2.652083430674264e-06, + "loss": 0.76270133, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.7476024627685547 + }, + { + "auxiliary_loss_clip": 0.01436308, + "auxiliary_loss_mlp": 0.01259897, + "balance_loss_clip": 1.11407781, + "balance_loss_mlp": 1.03196836, + "epoch": 0.4125056365549376, + "flos": 18695507590080.0, + "grad_norm": 1.8499735631592673, + "language_loss": 0.74460191, + "learning_rate": 2.651715238616068e-06, + "loss": 0.77156401, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.8221170902252197 + }, + { + "auxiliary_loss_clip": 0.01439893, + "auxiliary_loss_mlp": 0.01266009, + "balance_loss_clip": 1.11769962, + "balance_loss_mlp": 1.04017878, + "epoch": 0.41256575980760557, + "flos": 17897263519680.0, + "grad_norm": 2.409810574108308, + "language_loss": 0.80127978, + "learning_rate": 2.651347021844765e-06, + "loss": 0.82833886, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 2.8516998291015625 + }, + { + "auxiliary_loss_clip": 0.01445611, + "auxiliary_loss_mlp": 0.01267274, + "balance_loss_clip": 1.12357426, + "balance_loss_mlp": 1.03724742, + "epoch": 0.41262588306027354, + "flos": 21983494641120.0, + "grad_norm": 1.832385299154552, + "language_loss": 0.76354545, + "learning_rate": 2.650978780374318e-06, + "loss": 0.79067433, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.862576961517334 + }, + { + "auxiliary_loss_clip": 0.01544224, + "auxiliary_loss_mlp": 0.01222877, + "balance_loss_clip": 1.24668455, + "balance_loss_mlp": 1.01383209, + "epoch": 0.41268600631294156, + "flos": 53355478285440.0, + "grad_norm": 0.7072470316053109, + "language_loss": 0.52650577, + "learning_rate": 2.650610514218691e-06, + "loss": 0.55417681, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 3.2971675395965576 + }, + { + "auxiliary_loss_clip": 0.01443631, + "auxiliary_loss_mlp": 0.01265838, + "balance_loss_clip": 1.12086177, + "balance_loss_mlp": 1.03447652, + "epoch": 0.4127461295656095, + "flos": 24387443395200.0, + "grad_norm": 1.6955020232472189, + "language_loss": 0.72506666, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.75216132, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.859043836593628 + }, + { + "auxiliary_loss_clip": 0.01539349, + "auxiliary_loss_mlp": 0.01222298, + "balance_loss_clip": 1.24203229, + "balance_loss_mlp": 1.0140152, + "epoch": 0.4128062528182775, + "flos": 71711904846240.0, + "grad_norm": 0.9296817897603868, + "language_loss": 0.66522026, + "learning_rate": 2.649873907907753e-06, + "loss": 0.69283676, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 4.856863737106323 + }, + { + "auxiliary_loss_clip": 0.01440778, + "auxiliary_loss_mlp": 0.01263275, + "balance_loss_clip": 1.11841345, + "balance_loss_mlp": 1.03439319, + "epoch": 0.41286637607094545, + "flos": 17850535662240.0, + "grad_norm": 3.2921149093277338, + "language_loss": 0.81382668, + "learning_rate": 2.649505567780375e-06, + "loss": 0.84086728, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.738452196121216 + }, + { + "auxiliary_loss_clip": 0.01449767, + "auxiliary_loss_mlp": 0.01260346, + "balance_loss_clip": 1.12791061, + "balance_loss_mlp": 1.0276494, + "epoch": 0.4129264993236134, + "flos": 25551508212000.0, + "grad_norm": 1.9562331253154972, + "language_loss": 0.78060746, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.80770862, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.8238089084625244 + }, + { + "auxiliary_loss_clip": 0.01545301, + "auxiliary_loss_mlp": 0.01233688, + "balance_loss_clip": 1.24903655, + "balance_loss_mlp": 1.02464294, + "epoch": 0.4129866225762814, + "flos": 65419901792640.0, + "grad_norm": 0.852606056024994, + "language_loss": 0.57723564, + "learning_rate": 2.64876881365164e-06, + "loss": 0.60502553, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 3.018446207046509 + }, + { + "auxiliary_loss_clip": 0.01457935, + "auxiliary_loss_mlp": 0.01265049, + "balance_loss_clip": 1.13766611, + "balance_loss_mlp": 1.03788376, + "epoch": 0.41304674582894935, + "flos": 28879547764320.0, + "grad_norm": 1.7974096147673617, + "language_loss": 0.75514162, + "learning_rate": 2.64840039967822e-06, + "loss": 0.78237152, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 2.7920334339141846 + }, + { + "auxiliary_loss_clip": 0.01450716, + "auxiliary_loss_mlp": 0.01262937, + "balance_loss_clip": 1.12973869, + "balance_loss_mlp": 1.03405571, + "epoch": 0.4131068690816173, + "flos": 22894120942560.0, + "grad_norm": 1.759909002238732, + "language_loss": 0.83744007, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.86457658, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 2.7566089630126953 + }, + { + "auxiliary_loss_clip": 0.01458641, + "auxiliary_loss_mlp": 0.01275647, + "balance_loss_clip": 1.13744259, + "balance_loss_mlp": 1.04543042, + "epoch": 0.4131669923342853, + "flos": 26067450072960.0, + "grad_norm": 2.1016841979332623, + "language_loss": 0.68729639, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.71463931, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.825929641723633 + }, + { + "auxiliary_loss_clip": 0.01442818, + "auxiliary_loss_mlp": 0.01255442, + "balance_loss_clip": 1.12138271, + "balance_loss_mlp": 1.0267514, + "epoch": 0.41322711558695324, + "flos": 19246571291520.0, + "grad_norm": 1.8570680352538251, + "language_loss": 0.75960416, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.78658676, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 2.655337333679199 + }, + { + "auxiliary_loss_clip": 0.01452059, + "auxiliary_loss_mlp": 0.01270274, + "balance_loss_clip": 1.13209248, + "balance_loss_mlp": 1.039294, + "epoch": 0.4132872388396212, + "flos": 22676676615360.0, + "grad_norm": 2.3583247390928843, + "language_loss": 0.8359924, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.86321568, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 2.667874813079834 + }, + { + "auxiliary_loss_clip": 0.01453638, + "auxiliary_loss_mlp": 0.01263081, + "balance_loss_clip": 1.13365221, + "balance_loss_mlp": 1.03362656, + "epoch": 0.4133473620922892, + "flos": 20151887650560.0, + "grad_norm": 1.902414107114655, + "language_loss": 0.71658814, + "learning_rate": 2.646557961279436e-06, + "loss": 0.74375534, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 2.660183906555176 + }, + { + "auxiliary_loss_clip": 0.01446384, + "auxiliary_loss_mlp": 0.01256066, + "balance_loss_clip": 1.12772512, + "balance_loss_mlp": 1.02909136, + "epoch": 0.41340748534495714, + "flos": 24245021697120.0, + "grad_norm": 1.7012392888665941, + "language_loss": 0.82725936, + "learning_rate": 2.646189399991154e-06, + "loss": 0.85428387, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 4.137486219406128 + }, + { + "auxiliary_loss_clip": 0.01452005, + "auxiliary_loss_mlp": 0.01270543, + "balance_loss_clip": 1.13195086, + "balance_loss_mlp": 1.03765559, + "epoch": 0.41346760859762516, + "flos": 14393918190240.0, + "grad_norm": 4.809366546384208, + "language_loss": 0.65618849, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.68341392, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 2.689661979675293 + }, + { + "auxiliary_loss_clip": 0.01452526, + "auxiliary_loss_mlp": 0.0126315, + "balance_loss_clip": 1.13202918, + "balance_loss_mlp": 1.03503156, + "epoch": 0.4135277318502931, + "flos": 22494202416000.0, + "grad_norm": 2.00767225577122, + "language_loss": 0.76548433, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.79264116, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 4.236459732055664 + }, + { + "auxiliary_loss_clip": 0.01451049, + "auxiliary_loss_mlp": 0.01261728, + "balance_loss_clip": 1.13082504, + "balance_loss_mlp": 1.02941251, + "epoch": 0.4135878551029611, + "flos": 22420924482240.0, + "grad_norm": 1.8391648581279296, + "language_loss": 0.80500346, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.83213127, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.7411067485809326 + }, + { + "auxiliary_loss_clip": 0.01457878, + "auxiliary_loss_mlp": 0.01273748, + "balance_loss_clip": 1.13901901, + "balance_loss_mlp": 1.04601026, + "epoch": 0.41364797835562905, + "flos": 27055716046560.0, + "grad_norm": 1.7287688183060155, + "language_loss": 0.8447935, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.87210977, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 2.7706146240234375 + }, + { + "auxiliary_loss_clip": 0.01450658, + "auxiliary_loss_mlp": 0.012691, + "balance_loss_clip": 1.12946427, + "balance_loss_mlp": 1.03735697, + "epoch": 0.413708101608297, + "flos": 22969902134880.0, + "grad_norm": 1.7486127168682124, + "language_loss": 0.70504206, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.73223966, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.7530205249786377 + }, + { + "auxiliary_loss_clip": 0.01450085, + "auxiliary_loss_mlp": 0.01263743, + "balance_loss_clip": 1.12940228, + "balance_loss_mlp": 1.03695869, + "epoch": 0.413768224860965, + "flos": 13335674032800.0, + "grad_norm": 1.9917449895203063, + "language_loss": 0.80914843, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83628672, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.7582051753997803 + }, + { + "auxiliary_loss_clip": 0.0146357, + "auxiliary_loss_mlp": 0.01276992, + "balance_loss_clip": 1.14261734, + "balance_loss_mlp": 1.04257846, + "epoch": 0.41382834811363295, + "flos": 20816092146240.0, + "grad_norm": 2.5494999962462805, + "language_loss": 0.7029413, + "learning_rate": 2.643608785656077e-06, + "loss": 0.73034692, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 4.193112373352051 + }, + { + "auxiliary_loss_clip": 0.01453852, + "auxiliary_loss_mlp": 0.01271356, + "balance_loss_clip": 1.133762, + "balance_loss_mlp": 1.04132962, + "epoch": 0.4138884713663009, + "flos": 20669194925280.0, + "grad_norm": 1.8045192447387524, + "language_loss": 0.75953263, + "learning_rate": 2.643240028730663e-06, + "loss": 0.78678471, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.7908172607421875 + }, + { + "auxiliary_loss_clip": 0.01447786, + "auxiliary_loss_mlp": 0.0125226, + "balance_loss_clip": 1.12789416, + "balance_loss_mlp": 1.01975369, + "epoch": 0.4139485946189689, + "flos": 29059025639040.0, + "grad_norm": 1.433648914901062, + "language_loss": 0.75870132, + "learning_rate": 2.642871247413523e-06, + "loss": 0.78570175, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 2.8474905490875244 + }, + { + "auxiliary_loss_clip": 0.01449506, + "auxiliary_loss_mlp": 0.01262071, + "balance_loss_clip": 1.12857568, + "balance_loss_mlp": 1.02727592, + "epoch": 0.41400871787163684, + "flos": 24428026890720.0, + "grad_norm": 2.0670888744491083, + "language_loss": 0.69860893, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.7257247, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 2.906034231185913 + }, + { + "auxiliary_loss_clip": 0.01444373, + "auxiliary_loss_mlp": 0.01282991, + "balance_loss_clip": 1.12485313, + "balance_loss_mlp": 1.05048525, + "epoch": 0.4140688411243048, + "flos": 19466101667520.0, + "grad_norm": 2.4035499173807704, + "language_loss": 0.7581408, + "learning_rate": 2.642133611660002e-06, + "loss": 0.78541446, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.8518905639648438 + }, + { + "auxiliary_loss_clip": 0.01447111, + "auxiliary_loss_mlp": 0.01255031, + "balance_loss_clip": 1.1266768, + "balance_loss_mlp": 1.02405095, + "epoch": 0.4141289643769728, + "flos": 19315449558720.0, + "grad_norm": 2.115179327210408, + "language_loss": 0.70171332, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72873473, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 2.8143534660339355 + }, + { + "auxiliary_loss_clip": 0.0144724, + "auxiliary_loss_mlp": 0.01258772, + "balance_loss_clip": 1.1260426, + "balance_loss_mlp": 1.02893698, + "epoch": 0.41418908762964074, + "flos": 16728761108160.0, + "grad_norm": 1.7857810716462512, + "language_loss": 0.7609126, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78797269, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 2.7341370582580566 + }, + { + "auxiliary_loss_clip": 0.01449826, + "auxiliary_loss_mlp": 0.0126582, + "balance_loss_clip": 1.13000667, + "balance_loss_mlp": 1.03426838, + "epoch": 0.41424921088230876, + "flos": 25298942044320.0, + "grad_norm": 1.7510150074797715, + "language_loss": 0.8053658, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.83252227, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.8345253467559814 + }, + { + "auxiliary_loss_clip": 0.01451262, + "auxiliary_loss_mlp": 0.01268148, + "balance_loss_clip": 1.13032985, + "balance_loss_mlp": 1.03239942, + "epoch": 0.4143093341349767, + "flos": 20962685941920.0, + "grad_norm": 1.75785021935136, + "language_loss": 0.74150467, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76869881, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 2.7899014949798584 + }, + { + "auxiliary_loss_clip": 0.01452322, + "auxiliary_loss_mlp": 0.0126449, + "balance_loss_clip": 1.1318717, + "balance_loss_mlp": 1.0277878, + "epoch": 0.4143694573876447, + "flos": 22019868110880.0, + "grad_norm": 1.6801775496302254, + "language_loss": 0.84211028, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86927837, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.8047080039978027 + }, + { + "auxiliary_loss_clip": 0.01454418, + "auxiliary_loss_mlp": 0.01258167, + "balance_loss_clip": 1.13353169, + "balance_loss_mlp": 1.02833104, + "epoch": 0.41442958064031266, + "flos": 35700198976800.0, + "grad_norm": 1.606803312119295, + "language_loss": 0.70424557, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.7313714, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.9199044704437256 + }, + { + "auxiliary_loss_clip": 0.01447032, + "auxiliary_loss_mlp": 0.01262869, + "balance_loss_clip": 1.12640023, + "balance_loss_mlp": 1.03494072, + "epoch": 0.4144897038929806, + "flos": 28296320618880.0, + "grad_norm": 1.4172430355021024, + "language_loss": 0.72964346, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75674248, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 2.7890372276306152 + }, + { + "auxiliary_loss_clip": 0.01448553, + "auxiliary_loss_mlp": 0.01261512, + "balance_loss_clip": 1.1285255, + "balance_loss_mlp": 1.03148556, + "epoch": 0.4145498271456486, + "flos": 11649257496000.0, + "grad_norm": 3.584706175123593, + "language_loss": 0.62477303, + "learning_rate": 2.63918209577416e-06, + "loss": 0.65187371, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.738049030303955 + }, + { + "auxiliary_loss_clip": 0.01449969, + "auxiliary_loss_mlp": 0.01255689, + "balance_loss_clip": 1.12913525, + "balance_loss_mlp": 1.0264256, + "epoch": 0.41460995039831655, + "flos": 27238114389600.0, + "grad_norm": 1.6237987517521018, + "language_loss": 0.70765877, + "learning_rate": 2.638813047071192e-06, + "loss": 0.73471534, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.7960827350616455 + }, + { + "auxiliary_loss_clip": 0.01452061, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_clip": 1.13072777, + "balance_loss_mlp": 1.03589869, + "epoch": 0.4146700736509845, + "flos": 25924800805920.0, + "grad_norm": 1.6850026197088013, + "language_loss": 0.72897261, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75618869, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.829820394515991 + }, + { + "auxiliary_loss_clip": 0.01449122, + "auxiliary_loss_mlp": 0.01262296, + "balance_loss_clip": 1.12825084, + "balance_loss_mlp": 1.03188848, + "epoch": 0.4147301969036525, + "flos": 26835654676320.0, + "grad_norm": 1.7721638466259264, + "language_loss": 0.85006618, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.87718034, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.8076179027557373 + }, + { + "auxiliary_loss_clip": 0.01441524, + "auxiliary_loss_mlp": 0.01262377, + "balance_loss_clip": 1.11946511, + "balance_loss_mlp": 1.03044295, + "epoch": 0.41479032015632045, + "flos": 20300112357120.0, + "grad_norm": 1.9563796321224554, + "language_loss": 0.74395967, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.77099872, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.8212461471557617 + }, + { + "auxiliary_loss_clip": 0.01450354, + "auxiliary_loss_mlp": 0.01269753, + "balance_loss_clip": 1.12863946, + "balance_loss_mlp": 1.03381395, + "epoch": 0.4148504434089884, + "flos": 25267347522720.0, + "grad_norm": 15.356212070052887, + "language_loss": 0.75730586, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.78450692, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.8987185955047607 + }, + { + "auxiliary_loss_clip": 0.01456806, + "auxiliary_loss_mlp": 0.01266436, + "balance_loss_clip": 1.13561606, + "balance_loss_mlp": 1.0354557, + "epoch": 0.4149105666616564, + "flos": 12823904269440.0, + "grad_norm": 2.7468956199014514, + "language_loss": 0.79951811, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82675052, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 2.816301107406616 + }, + { + "auxiliary_loss_clip": 0.01439279, + "auxiliary_loss_mlp": 0.01264074, + "balance_loss_clip": 1.11789942, + "balance_loss_mlp": 1.03290331, + "epoch": 0.41497068991432434, + "flos": 16765779356640.0, + "grad_norm": 1.6877900374926944, + "language_loss": 0.69462538, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.72165889, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.799149990081787 + }, + { + "auxiliary_loss_clip": 0.01449615, + "auxiliary_loss_mlp": 0.01248039, + "balance_loss_clip": 1.12813473, + "balance_loss_mlp": 1.01896632, + "epoch": 0.4150308131669923, + "flos": 18002666969280.0, + "grad_norm": 1.5791247551701713, + "language_loss": 0.83870113, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.8656776, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 2.784015417098999 + }, + { + "auxiliary_loss_clip": 0.01448555, + "auxiliary_loss_mlp": 0.01271548, + "balance_loss_clip": 1.1274569, + "balance_loss_mlp": 1.03808904, + "epoch": 0.41509093641966033, + "flos": 30047405397120.0, + "grad_norm": 3.1120419542713185, + "language_loss": 0.68182677, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.70902783, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.872943878173828 + }, + { + "auxiliary_loss_clip": 0.01442469, + "auxiliary_loss_mlp": 0.01271066, + "balance_loss_clip": 1.12040174, + "balance_loss_mlp": 1.03894198, + "epoch": 0.4151510596723283, + "flos": 24282305442720.0, + "grad_norm": 1.6370956491595814, + "language_loss": 0.77556491, + "learning_rate": 2.635490520350643e-06, + "loss": 0.80270034, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 4.431305408477783 + }, + { + "auxiliary_loss_clip": 0.01455035, + "auxiliary_loss_mlp": 0.01272284, + "balance_loss_clip": 1.1334343, + "balance_loss_mlp": 1.03710818, + "epoch": 0.41521118292499626, + "flos": 23478410076480.0, + "grad_norm": 1.6270654834715639, + "language_loss": 0.68706709, + "learning_rate": 2.635121230039025e-06, + "loss": 0.71434027, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.9086968898773193 + }, + { + "auxiliary_loss_clip": 0.01444259, + "auxiliary_loss_mlp": 0.01264321, + "balance_loss_clip": 1.1218555, + "balance_loss_mlp": 1.0337224, + "epoch": 0.4152713061776642, + "flos": 22127585178240.0, + "grad_norm": 3.2051562470261787, + "language_loss": 0.67301154, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.70009732, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.8051536083221436 + }, + { + "auxiliary_loss_clip": 0.01453889, + "auxiliary_loss_mlp": 0.01271758, + "balance_loss_clip": 1.13330197, + "balance_loss_mlp": 1.04268539, + "epoch": 0.4153314294303322, + "flos": 21253521987360.0, + "grad_norm": 2.53957875593259, + "language_loss": 0.7738055, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.80106199, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.8020243644714355 + }, + { + "auxiliary_loss_clip": 0.0156097, + "auxiliary_loss_mlp": 0.01242126, + "balance_loss_clip": 1.25610518, + "balance_loss_mlp": 1.03460693, + "epoch": 0.41539155268300015, + "flos": 57926967022080.0, + "grad_norm": 0.7676471112424877, + "language_loss": 0.64735615, + "learning_rate": 2.634013214657026e-06, + "loss": 0.67538702, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.3363842964172363 + }, + { + "auxiliary_loss_clip": 0.01440212, + "auxiliary_loss_mlp": 0.0126652, + "balance_loss_clip": 1.11817372, + "balance_loss_mlp": 1.03782845, + "epoch": 0.4154516759356681, + "flos": 21905589471840.0, + "grad_norm": 1.42903450389447, + "language_loss": 0.87231326, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89938056, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 2.840721368789673 + }, + { + "auxiliary_loss_clip": 0.01563251, + "auxiliary_loss_mlp": 0.01241234, + "balance_loss_clip": 1.25938535, + "balance_loss_mlp": 1.03371429, + "epoch": 0.4155117991883361, + "flos": 67839894158400.0, + "grad_norm": 0.8409719716267379, + "language_loss": 0.62071824, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64876312, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 3.2207424640655518 + }, + { + "auxiliary_loss_clip": 0.01455544, + "auxiliary_loss_mlp": 0.01268077, + "balance_loss_clip": 1.13442218, + "balance_loss_mlp": 1.03499949, + "epoch": 0.41557192244100405, + "flos": 14284418499360.0, + "grad_norm": 2.6000060179809807, + "language_loss": 0.8755694, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90280557, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 2.749509334564209 + }, + { + "auxiliary_loss_clip": 0.01445053, + "auxiliary_loss_mlp": 0.01263135, + "balance_loss_clip": 1.12425947, + "balance_loss_mlp": 1.03349066, + "epoch": 0.415632045693672, + "flos": 24464248647840.0, + "grad_norm": 2.238640281198116, + "language_loss": 0.63611215, + "learning_rate": 2.632535524293914e-06, + "loss": 0.663194, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.7806754112243652 + }, + { + "auxiliary_loss_clip": 0.01447468, + "auxiliary_loss_mlp": 0.01270592, + "balance_loss_clip": 1.12710977, + "balance_loss_mlp": 1.04323649, + "epoch": 0.41569216894634, + "flos": 20117069235360.0, + "grad_norm": 2.0139790981347763, + "language_loss": 0.75330424, + "learning_rate": 2.632166041703586e-06, + "loss": 0.7804848, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 4.298918962478638 + }, + { + "auxiliary_loss_clip": 0.01443809, + "auxiliary_loss_mlp": 0.01266174, + "balance_loss_clip": 1.12344193, + "balance_loss_mlp": 1.03462219, + "epoch": 0.41575229219900794, + "flos": 23800726859040.0, + "grad_norm": 1.828305016440842, + "language_loss": 0.87874258, + "learning_rate": 2.631796535141458e-06, + "loss": 0.90584242, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 2.8177382946014404 + }, + { + "auxiliary_loss_clip": 0.01460689, + "auxiliary_loss_mlp": 0.01256559, + "balance_loss_clip": 1.13948309, + "balance_loss_mlp": 1.02386284, + "epoch": 0.4158124154516759, + "flos": 23110237784160.0, + "grad_norm": 3.7846905368813504, + "language_loss": 0.70894057, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73611307, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 4.373914480209351 + }, + { + "auxiliary_loss_clip": 0.0144439, + "auxiliary_loss_mlp": 0.01258301, + "balance_loss_clip": 1.12433851, + "balance_loss_mlp": 1.02388763, + "epoch": 0.41587253870434393, + "flos": 24245249266080.0, + "grad_norm": 1.4750441149196991, + "language_loss": 0.72221881, + "learning_rate": 2.631057450157852e-06, + "loss": 0.74924576, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 2.8824188709259033 + }, + { + "auxiliary_loss_clip": 0.01438615, + "auxiliary_loss_mlp": 0.01260378, + "balance_loss_clip": 1.11706269, + "balance_loss_mlp": 1.03225946, + "epoch": 0.4159326619570119, + "flos": 23884055755200.0, + "grad_norm": 2.1377731645939777, + "language_loss": 0.80964971, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.8366397, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 2.867765426635742 + }, + { + "auxiliary_loss_clip": 0.01443525, + "auxiliary_loss_mlp": 0.01263995, + "balance_loss_clip": 1.1221118, + "balance_loss_mlp": 1.03053594, + "epoch": 0.41599278520967986, + "flos": 40629809043360.0, + "grad_norm": 1.4613536745644022, + "language_loss": 0.70417356, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.73124874, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 2.955091714859009 + }, + { + "auxiliary_loss_clip": 0.01444489, + "auxiliary_loss_mlp": 0.01260344, + "balance_loss_clip": 1.12277579, + "balance_loss_mlp": 1.03050804, + "epoch": 0.4160529084623478, + "flos": 18224890244640.0, + "grad_norm": 1.931333663013021, + "language_loss": 0.81455618, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.84160447, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.783754348754883 + }, + { + "auxiliary_loss_clip": 0.01439192, + "auxiliary_loss_mlp": 0.01257893, + "balance_loss_clip": 1.11636043, + "balance_loss_mlp": 1.02843928, + "epoch": 0.4161130317150158, + "flos": 13663642111200.0, + "grad_norm": 1.9284584161617992, + "language_loss": 0.65512705, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.68209791, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 2.8830525875091553 + }, + { + "auxiliary_loss_clip": 0.01443691, + "auxiliary_loss_mlp": 0.0126631, + "balance_loss_clip": 1.12183809, + "balance_loss_mlp": 1.03628349, + "epoch": 0.41617315496768376, + "flos": 16180276521600.0, + "grad_norm": 2.3015303390090223, + "language_loss": 0.8032887, + "learning_rate": 2.629209319173274e-06, + "loss": 0.83038872, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 4.29810905456543 + }, + { + "auxiliary_loss_clip": 0.01439725, + "auxiliary_loss_mlp": 0.01258393, + "balance_loss_clip": 1.11739349, + "balance_loss_mlp": 1.02607799, + "epoch": 0.4162332782203517, + "flos": 26215712707680.0, + "grad_norm": 6.7731475277483, + "language_loss": 0.67807615, + "learning_rate": 2.628839621341247e-06, + "loss": 0.70505738, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 2.8498332500457764 + }, + { + "auxiliary_loss_clip": 0.01450443, + "auxiliary_loss_mlp": 0.01260577, + "balance_loss_clip": 1.12934279, + "balance_loss_mlp": 1.02692652, + "epoch": 0.4162934014730197, + "flos": 28186517502720.0, + "grad_norm": 2.8999044729961923, + "language_loss": 0.75760901, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78471923, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 2.877946138381958 + }, + { + "auxiliary_loss_clip": 0.01442464, + "auxiliary_loss_mlp": 0.01265785, + "balance_loss_clip": 1.11956835, + "balance_loss_mlp": 1.03537714, + "epoch": 0.41635352472568765, + "flos": 19867385607840.0, + "grad_norm": 1.8288628410160868, + "language_loss": 0.73605478, + "learning_rate": 2.62810015415423e-06, + "loss": 0.76313734, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.835130214691162 + }, + { + "auxiliary_loss_clip": 0.01436579, + "auxiliary_loss_mlp": 0.01251742, + "balance_loss_clip": 1.11447465, + "balance_loss_mlp": 1.02228808, + "epoch": 0.4164136479783556, + "flos": 14936903193600.0, + "grad_norm": 2.860357427892436, + "language_loss": 0.84020412, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86708736, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 2.781695604324341 + }, + { + "auxiliary_loss_clip": 0.01440264, + "auxiliary_loss_mlp": 0.01250874, + "balance_loss_clip": 1.11827207, + "balance_loss_mlp": 1.02294588, + "epoch": 0.4164737712310236, + "flos": 21759488742240.0, + "grad_norm": 1.7543760546876408, + "language_loss": 0.86731374, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.89422512, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 2.779707431793213 + }, + { + "auxiliary_loss_clip": 0.01453953, + "auxiliary_loss_mlp": 0.01256344, + "balance_loss_clip": 1.13201082, + "balance_loss_mlp": 1.02402878, + "epoch": 0.41653389448369155, + "flos": 20742207361920.0, + "grad_norm": 2.386478315306744, + "language_loss": 0.7292012, + "learning_rate": 2.626990774776604e-06, + "loss": 0.75630414, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 2.793477773666382 + }, + { + "auxiliary_loss_clip": 0.01440549, + "auxiliary_loss_mlp": 0.01249166, + "balance_loss_clip": 1.11798644, + "balance_loss_mlp": 1.02104759, + "epoch": 0.4165940177363595, + "flos": 24975221919840.0, + "grad_norm": 1.9863235560935428, + "language_loss": 0.77822983, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80512702, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": 2.8251922130584717 + }, + { + "auxiliary_loss_clip": 0.0144925, + "auxiliary_loss_mlp": 0.01256195, + "balance_loss_clip": 1.12671089, + "balance_loss_mlp": 1.02883959, + "epoch": 0.41665414098902753, + "flos": 20523814830720.0, + "grad_norm": 1.793305750510237, + "language_loss": 0.71045387, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73750836, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.7656822204589844 + }, + { + "auxiliary_loss_clip": 0.01442294, + "auxiliary_loss_mlp": 0.01256237, + "balance_loss_clip": 1.12019348, + "balance_loss_mlp": 1.02697408, + "epoch": 0.4167142642416955, + "flos": 19684987264800.0, + "grad_norm": 1.7331040866033824, + "language_loss": 0.81091785, + "learning_rate": 2.625881181419007e-06, + "loss": 0.8379032, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 2.743734836578369 + }, + { + "auxiliary_loss_clip": 0.01443351, + "auxiliary_loss_mlp": 0.01256663, + "balance_loss_clip": 1.11970305, + "balance_loss_mlp": 1.02682734, + "epoch": 0.41677438749436346, + "flos": 23765718803040.0, + "grad_norm": 2.156374218270788, + "language_loss": 0.79028863, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81728876, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.823789596557617 + }, + { + "auxiliary_loss_clip": 0.01452404, + "auxiliary_loss_mlp": 0.01258218, + "balance_loss_clip": 1.12778866, + "balance_loss_mlp": 1.0281918, + "epoch": 0.41683451074703143, + "flos": 30412733077440.0, + "grad_norm": 1.8708578294259008, + "language_loss": 0.81540412, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.84251034, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.8611600399017334 + }, + { + "auxiliary_loss_clip": 0.01445698, + "auxiliary_loss_mlp": 0.01263002, + "balance_loss_clip": 1.12148321, + "balance_loss_mlp": 1.03297544, + "epoch": 0.4168946339996994, + "flos": 21508894838880.0, + "grad_norm": 2.4603149348389364, + "language_loss": 0.76899081, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79607785, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.7420480251312256 + }, + { + "auxiliary_loss_clip": 0.01444624, + "auxiliary_loss_mlp": 0.01255181, + "balance_loss_clip": 1.12058759, + "balance_loss_mlp": 1.02668071, + "epoch": 0.41695475725236736, + "flos": 17640563182560.0, + "grad_norm": 1.9794265400943027, + "language_loss": 0.67280191, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69980001, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.7495272159576416 + }, + { + "auxiliary_loss_clip": 0.01457053, + "auxiliary_loss_mlp": 0.01256403, + "balance_loss_clip": 1.13308597, + "balance_loss_mlp": 1.02523232, + "epoch": 0.4170148805050353, + "flos": 15670706591520.0, + "grad_norm": 2.4528495774376218, + "language_loss": 0.73254073, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75967526, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.6930298805236816 + }, + { + "auxiliary_loss_clip": 0.01448742, + "auxiliary_loss_mlp": 0.01249707, + "balance_loss_clip": 1.12470698, + "balance_loss_mlp": 1.02273297, + "epoch": 0.4170750037577033, + "flos": 15160984948800.0, + "grad_norm": 1.935118374747805, + "language_loss": 0.74165082, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.76863539, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.7312235832214355 + }, + { + "auxiliary_loss_clip": 0.01446898, + "auxiliary_loss_mlp": 0.01250365, + "balance_loss_clip": 1.12295723, + "balance_loss_mlp": 1.025298, + "epoch": 0.41713512701037125, + "flos": 28770882492960.0, + "grad_norm": 1.4755945239097323, + "language_loss": 0.8441025, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.87107515, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 2.825091600418091 + }, + { + "auxiliary_loss_clip": 0.01447927, + "auxiliary_loss_mlp": 0.01265565, + "balance_loss_clip": 1.12304831, + "balance_loss_mlp": 1.03706443, + "epoch": 0.4171952502630392, + "flos": 28259529939360.0, + "grad_norm": 1.8988747479630415, + "language_loss": 0.74217796, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76931286, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.8157505989074707 + }, + { + "auxiliary_loss_clip": 0.01450261, + "auxiliary_loss_mlp": 0.01256666, + "balance_loss_clip": 1.12607896, + "balance_loss_mlp": 1.0302639, + "epoch": 0.4172553735157072, + "flos": 24574013835840.0, + "grad_norm": 3.897573001500407, + "language_loss": 0.75690114, + "learning_rate": 2.622551121253579e-06, + "loss": 0.78397048, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.8595573902130127 + }, + { + "auxiliary_loss_clip": 0.01448373, + "auxiliary_loss_mlp": 0.01261343, + "balance_loss_clip": 1.12448645, + "balance_loss_mlp": 1.0351311, + "epoch": 0.41731549676837515, + "flos": 27047864917440.0, + "grad_norm": 1.9824494377429926, + "language_loss": 0.71356064, + "learning_rate": 2.622180996345424e-06, + "loss": 0.74065781, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 2.83082914352417 + }, + { + "auxiliary_loss_clip": 0.01450592, + "auxiliary_loss_mlp": 0.01260971, + "balance_loss_clip": 1.12605047, + "balance_loss_mlp": 1.03266144, + "epoch": 0.4173756200210431, + "flos": 28396110700800.0, + "grad_norm": 1.9733559883099523, + "language_loss": 0.7429893, + "learning_rate": 2.621810847844104e-06, + "loss": 0.770105, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 4.387242317199707 + }, + { + "auxiliary_loss_clip": 0.01456222, + "auxiliary_loss_mlp": 0.01272006, + "balance_loss_clip": 1.13157165, + "balance_loss_mlp": 1.04369664, + "epoch": 0.41743574327371114, + "flos": 22523369535360.0, + "grad_norm": 2.9952151348818608, + "language_loss": 0.72601163, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.75329387, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 2.7439467906951904 + }, + { + "auxiliary_loss_clip": 0.0144734, + "auxiliary_loss_mlp": 0.01267096, + "balance_loss_clip": 1.1235652, + "balance_loss_mlp": 1.03783298, + "epoch": 0.4174958665263791, + "flos": 30115828526400.0, + "grad_norm": 1.8386034617289944, + "language_loss": 0.63742429, + "learning_rate": 2.621070480118111e-06, + "loss": 0.66456866, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.848369598388672 + }, + { + "auxiliary_loss_clip": 0.01445928, + "auxiliary_loss_mlp": 0.01263436, + "balance_loss_clip": 1.12251127, + "balance_loss_mlp": 1.03646207, + "epoch": 0.41755598977904707, + "flos": 25265716611840.0, + "grad_norm": 2.0205391704283118, + "language_loss": 0.70243621, + "learning_rate": 2.620700260921513e-06, + "loss": 0.7295298, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.848029851913452 + }, + { + "auxiliary_loss_clip": 0.01448642, + "auxiliary_loss_mlp": 0.01259441, + "balance_loss_clip": 1.12565637, + "balance_loss_mlp": 1.03246653, + "epoch": 0.41761611303171503, + "flos": 19830632856480.0, + "grad_norm": 1.8333434507376674, + "language_loss": 0.81105024, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83813107, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.7582406997680664 + }, + { + "auxiliary_loss_clip": 0.01456358, + "auxiliary_loss_mlp": 0.01265972, + "balance_loss_clip": 1.1348238, + "balance_loss_mlp": 1.03670847, + "epoch": 0.417676236284383, + "flos": 15525060999840.0, + "grad_norm": 2.2714982322405035, + "language_loss": 0.77853805, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.80576134, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.828202486038208 + }, + { + "auxiliary_loss_clip": 0.01456441, + "auxiliary_loss_mlp": 0.01269257, + "balance_loss_clip": 1.13289428, + "balance_loss_mlp": 1.04209137, + "epoch": 0.41773635953705096, + "flos": 32527173271680.0, + "grad_norm": 1.9742024816133694, + "language_loss": 0.7181164, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.74537337, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.8878884315490723 + }, + { + "auxiliary_loss_clip": 0.01450835, + "auxiliary_loss_mlp": 0.01258425, + "balance_loss_clip": 1.12767863, + "balance_loss_mlp": 1.03240395, + "epoch": 0.4177964827897189, + "flos": 23443212379680.0, + "grad_norm": 1.4858761576890915, + "language_loss": 0.76907897, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79617155, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 2.800675630569458 + }, + { + "auxiliary_loss_clip": 0.01456722, + "auxiliary_loss_mlp": 0.01265472, + "balance_loss_clip": 1.13412428, + "balance_loss_mlp": 1.03544545, + "epoch": 0.4178566060423869, + "flos": 22751206178400.0, + "grad_norm": 1.5905637128273615, + "language_loss": 0.82023978, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.8474617, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 3.015108108520508 + }, + { + "auxiliary_loss_clip": 0.01457491, + "auxiliary_loss_mlp": 0.01252928, + "balance_loss_clip": 1.13489485, + "balance_loss_mlp": 1.02671671, + "epoch": 0.41791672929505486, + "flos": 26035286628960.0, + "grad_norm": 1.3759109625502297, + "language_loss": 0.76102155, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78812575, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 2.9190568923950195 + }, + { + "auxiliary_loss_clip": 0.01464483, + "auxiliary_loss_mlp": 0.01262612, + "balance_loss_clip": 1.14205122, + "balance_loss_mlp": 1.03201342, + "epoch": 0.4179768525477228, + "flos": 19570481056800.0, + "grad_norm": 1.762813141121045, + "language_loss": 0.73647523, + "learning_rate": 2.61810806829516e-06, + "loss": 0.76374614, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.981405019760132 + }, + { + "auxiliary_loss_clip": 0.01454928, + "auxiliary_loss_mlp": 0.01256056, + "balance_loss_clip": 1.13244486, + "balance_loss_mlp": 1.02717447, + "epoch": 0.4180369758003908, + "flos": 17785677780000.0, + "grad_norm": 2.3091413190263888, + "language_loss": 0.7231462, + "learning_rate": 2.617737661195593e-06, + "loss": 0.75025612, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 4.218497276306152 + }, + { + "auxiliary_loss_clip": 0.01464791, + "auxiliary_loss_mlp": 0.01258725, + "balance_loss_clip": 1.14446425, + "balance_loss_mlp": 1.03136897, + "epoch": 0.41809709905305875, + "flos": 20962837654560.0, + "grad_norm": 1.6688243924565127, + "language_loss": 0.76103961, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78827477, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 4.327341794967651 + }, + { + "auxiliary_loss_clip": 0.01468717, + "auxiliary_loss_mlp": 0.01266349, + "balance_loss_clip": 1.14678383, + "balance_loss_mlp": 1.0382297, + "epoch": 0.4181572223057267, + "flos": 22019792254560.0, + "grad_norm": 2.202070025214967, + "language_loss": 0.8430202, + "learning_rate": 2.616996776736485e-06, + "loss": 0.87037081, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 2.8047871589660645 + }, + { + "auxiliary_loss_clip": 0.0145933, + "auxiliary_loss_mlp": 0.01253844, + "balance_loss_clip": 1.13893235, + "balance_loss_mlp": 1.02706027, + "epoch": 0.4182173455583947, + "flos": 26247686510880.0, + "grad_norm": 2.2839149581022933, + "language_loss": 0.83030295, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85743475, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 2.827216625213623 + }, + { + "auxiliary_loss_clip": 0.01459175, + "auxiliary_loss_mlp": 0.01263125, + "balance_loss_clip": 1.1372273, + "balance_loss_mlp": 1.03290844, + "epoch": 0.4182774688110627, + "flos": 14793267794400.0, + "grad_norm": 2.0553827440427, + "language_loss": 0.71897578, + "learning_rate": 2.616255798691059e-06, + "loss": 0.74619877, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.8499536514282227 + }, + { + "auxiliary_loss_clip": 0.01462127, + "auxiliary_loss_mlp": 0.01257593, + "balance_loss_clip": 1.14092898, + "balance_loss_mlp": 1.02852011, + "epoch": 0.41833759206373067, + "flos": 20414087570880.0, + "grad_norm": 1.9370483081731034, + "language_loss": 0.75178105, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77897823, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 2.7923107147216797 + }, + { + "auxiliary_loss_clip": 0.0145618, + "auxiliary_loss_mlp": 0.01267049, + "balance_loss_clip": 1.13507175, + "balance_loss_mlp": 1.03912127, + "epoch": 0.41839771531639863, + "flos": 23658418945440.0, + "grad_norm": 1.7169782321089166, + "language_loss": 0.7729376, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.80016994, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 4.304449558258057 + }, + { + "auxiliary_loss_clip": 0.0145196, + "auxiliary_loss_mlp": 0.01264254, + "balance_loss_clip": 1.13164687, + "balance_loss_mlp": 1.03556299, + "epoch": 0.4184578385690666, + "flos": 19756103293440.0, + "grad_norm": 2.015700258468708, + "language_loss": 0.77031255, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.79747462, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 2.9291908740997314 + }, + { + "auxiliary_loss_clip": 0.01454308, + "auxiliary_loss_mlp": 0.01261547, + "balance_loss_clip": 1.13287354, + "balance_loss_mlp": 1.03552592, + "epoch": 0.41851796182173456, + "flos": 20195467470720.0, + "grad_norm": 4.433480976889378, + "language_loss": 0.75958401, + "learning_rate": 2.614773562290835e-06, + "loss": 0.78674257, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.783416271209717 + }, + { + "auxiliary_loss_clip": 0.01589996, + "auxiliary_loss_mlp": 0.01214607, + "balance_loss_clip": 1.29345632, + "balance_loss_mlp": 1.00479889, + "epoch": 0.41857808507440253, + "flos": 59025339537120.0, + "grad_norm": 0.7827065186967985, + "language_loss": 0.54633629, + "learning_rate": 2.61440294487496e-06, + "loss": 0.57438231, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 3.250420331954956 + }, + { + "auxiliary_loss_clip": 0.01466428, + "auxiliary_loss_mlp": 0.01260136, + "balance_loss_clip": 1.14484906, + "balance_loss_mlp": 1.03201759, + "epoch": 0.4186382083270705, + "flos": 18480794090400.0, + "grad_norm": 1.933941361464207, + "language_loss": 0.85682589, + "learning_rate": 2.614032304160864e-06, + "loss": 0.88409162, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 2.80950665473938 + }, + { + "auxiliary_loss_clip": 0.0145579, + "auxiliary_loss_mlp": 0.01255692, + "balance_loss_clip": 1.13449144, + "balance_loss_mlp": 1.02967072, + "epoch": 0.41869833157973846, + "flos": 21580845287040.0, + "grad_norm": 1.5513631900561093, + "language_loss": 0.70663464, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.73374939, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.7718560695648193 + }, + { + "auxiliary_loss_clip": 0.01460584, + "auxiliary_loss_mlp": 0.01267918, + "balance_loss_clip": 1.13928485, + "balance_loss_mlp": 1.04151607, + "epoch": 0.4187584548324064, + "flos": 35520341820480.0, + "grad_norm": 1.6412830993638294, + "language_loss": 0.71057487, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.7378599, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 2.8996965885162354 + }, + { + "auxiliary_loss_clip": 0.01445784, + "auxiliary_loss_mlp": 0.01256129, + "balance_loss_clip": 1.12534559, + "balance_loss_mlp": 1.03163457, + "epoch": 0.4188185780850744, + "flos": 18657579065760.0, + "grad_norm": 1.572589864567785, + "language_loss": 0.72114396, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.7481631, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 2.825239658355713 + }, + { + "auxiliary_loss_clip": 0.0145776, + "auxiliary_loss_mlp": 0.01266631, + "balance_loss_clip": 1.1358763, + "balance_loss_mlp": 1.03698587, + "epoch": 0.41887870133774235, + "flos": 40336621452000.0, + "grad_norm": 2.1999626683053592, + "language_loss": 0.71179444, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73903835, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.958918571472168 + }, + { + "auxiliary_loss_clip": 0.01581047, + "auxiliary_loss_mlp": 0.01227943, + "balance_loss_clip": 1.28484344, + "balance_loss_mlp": 1.01813507, + "epoch": 0.4189388245904103, + "flos": 61376947066080.0, + "grad_norm": 0.6697838091048308, + "language_loss": 0.4609791, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48906901, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.312532901763916 + }, + { + "auxiliary_loss_clip": 0.01451272, + "auxiliary_loss_mlp": 0.01261162, + "balance_loss_clip": 1.12968922, + "balance_loss_mlp": 1.0303731, + "epoch": 0.4189989478430783, + "flos": 28217884455360.0, + "grad_norm": 1.939584897797034, + "language_loss": 0.75086689, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77799124, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.9144198894500732 + }, + { + "auxiliary_loss_clip": 0.01446668, + "auxiliary_loss_mlp": 0.01265853, + "balance_loss_clip": 1.12577927, + "balance_loss_mlp": 1.04002309, + "epoch": 0.4190590710957463, + "flos": 24567869473920.0, + "grad_norm": 1.7625239475761008, + "language_loss": 0.80675101, + "learning_rate": 2.611437167992705e-06, + "loss": 0.83387619, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.9153213500976562 + }, + { + "auxiliary_loss_clip": 0.01460503, + "auxiliary_loss_mlp": 0.01257803, + "balance_loss_clip": 1.13968468, + "balance_loss_mlp": 1.03044665, + "epoch": 0.41911919434841427, + "flos": 21728197645920.0, + "grad_norm": 1.8979624412961105, + "language_loss": 0.83456039, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.86174345, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.7424750328063965 + }, + { + "auxiliary_loss_clip": 0.01457619, + "auxiliary_loss_mlp": 0.01249108, + "balance_loss_clip": 1.13690281, + "balance_loss_mlp": 1.01946342, + "epoch": 0.41917931760108224, + "flos": 17603393221440.0, + "grad_norm": 1.7618970938016507, + "language_loss": 0.74881405, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.77588129, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.7535200119018555 + }, + { + "auxiliary_loss_clip": 0.01451128, + "auxiliary_loss_mlp": 0.01264873, + "balance_loss_clip": 1.13076973, + "balance_loss_mlp": 1.03694463, + "epoch": 0.4192394408537502, + "flos": 37819797400800.0, + "grad_norm": 1.5674857898142762, + "language_loss": 0.72874236, + "learning_rate": 2.610324618710212e-06, + "loss": 0.75590229, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 3.041029691696167 + }, + { + "auxiliary_loss_clip": 0.0146146, + "auxiliary_loss_mlp": 0.0126915, + "balance_loss_clip": 1.14001822, + "balance_loss_mlp": 1.03588104, + "epoch": 0.41929956410641817, + "flos": 23109706789920.0, + "grad_norm": 1.8397079965897578, + "language_loss": 0.74788237, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77518845, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.7489242553710938 + }, + { + "auxiliary_loss_clip": 0.01448108, + "auxiliary_loss_mlp": 0.01260787, + "balance_loss_clip": 1.12702298, + "balance_loss_mlp": 1.03285861, + "epoch": 0.41935968735908613, + "flos": 22526517572640.0, + "grad_norm": 1.9751399970867336, + "language_loss": 0.72860521, + "learning_rate": 2.609582803447259e-06, + "loss": 0.75569415, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.9607365131378174 + }, + { + "auxiliary_loss_clip": 0.01457158, + "auxiliary_loss_mlp": 0.01260919, + "balance_loss_clip": 1.13620675, + "balance_loss_mlp": 1.03222811, + "epoch": 0.4194198106117541, + "flos": 26872900493760.0, + "grad_norm": 1.855545782867224, + "language_loss": 0.80932802, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83650875, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.861449718475342 + }, + { + "auxiliary_loss_clip": 0.01447421, + "auxiliary_loss_mlp": 0.01249981, + "balance_loss_clip": 1.12665939, + "balance_loss_mlp": 1.02319765, + "epoch": 0.41947993386442206, + "flos": 19904517640800.0, + "grad_norm": 1.9620198468320278, + "language_loss": 0.67420202, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.70117605, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 2.814199209213257 + }, + { + "auxiliary_loss_clip": 0.01453103, + "auxiliary_loss_mlp": 0.01248828, + "balance_loss_clip": 1.1336937, + "balance_loss_mlp": 1.01956451, + "epoch": 0.41954005711709, + "flos": 17385797181600.0, + "grad_norm": 2.5568311396863477, + "language_loss": 0.81394041, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.84095967, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 2.855056047439575 + }, + { + "auxiliary_loss_clip": 0.0144869, + "auxiliary_loss_mlp": 0.01265944, + "balance_loss_clip": 1.12811744, + "balance_loss_mlp": 1.03744364, + "epoch": 0.419600180369758, + "flos": 25005299315040.0, + "grad_norm": 1.8105135786903714, + "language_loss": 0.82736224, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.85450852, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.7933754920959473 + }, + { + "auxiliary_loss_clip": 0.0144543, + "auxiliary_loss_mlp": 0.01251256, + "balance_loss_clip": 1.12471879, + "balance_loss_mlp": 1.02275538, + "epoch": 0.41966030362242596, + "flos": 17385683397120.0, + "grad_norm": 2.1113531242994776, + "language_loss": 0.83603299, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.86299986, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 2.88375186920166 + }, + { + "auxiliary_loss_clip": 0.01450507, + "auxiliary_loss_mlp": 0.01246664, + "balance_loss_clip": 1.12863266, + "balance_loss_mlp": 1.01701891, + "epoch": 0.4197204268750939, + "flos": 22157928070560.0, + "grad_norm": 2.789347110929286, + "language_loss": 0.79520822, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.82217991, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 4.368924856185913 + }, + { + "auxiliary_loss_clip": 0.01444204, + "auxiliary_loss_mlp": 0.01246963, + "balance_loss_clip": 1.124084, + "balance_loss_mlp": 1.02113318, + "epoch": 0.4197805501277619, + "flos": 22085939694240.0, + "grad_norm": 1.7284826223702987, + "language_loss": 0.84385395, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.87076569, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.795337677001953 + }, + { + "auxiliary_loss_clip": 0.01448526, + "auxiliary_loss_mlp": 0.01262167, + "balance_loss_clip": 1.12820423, + "balance_loss_mlp": 1.03271329, + "epoch": 0.4198406733804299, + "flos": 26434598304960.0, + "grad_norm": 3.051498035378495, + "language_loss": 0.56635904, + "learning_rate": 2.606614618903214e-06, + "loss": 0.59346592, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 2.8828558921813965 + }, + { + "auxiliary_loss_clip": 0.01443218, + "auxiliary_loss_mlp": 0.01245023, + "balance_loss_clip": 1.1226697, + "balance_loss_mlp": 1.01652217, + "epoch": 0.4199007966330979, + "flos": 12532651014240.0, + "grad_norm": 1.8150437003485014, + "language_loss": 0.82272708, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84960955, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.880530834197998 + }, + { + "auxiliary_loss_clip": 0.01445304, + "auxiliary_loss_mlp": 0.01258699, + "balance_loss_clip": 1.12466526, + "balance_loss_mlp": 1.0298171, + "epoch": 0.41996091988576584, + "flos": 21765140038080.0, + "grad_norm": 2.0675241103283524, + "language_loss": 0.7965191, + "learning_rate": 2.605872342456914e-06, + "loss": 0.82355917, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.7537412643432617 + }, + { + "auxiliary_loss_clip": 0.0144901, + "auxiliary_loss_mlp": 0.01266962, + "balance_loss_clip": 1.12957978, + "balance_loss_mlp": 1.03579164, + "epoch": 0.4200210431384338, + "flos": 26544363492960.0, + "grad_norm": 2.320443260692757, + "language_loss": 0.78523469, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.8123945, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 2.7881293296813965 + }, + { + "auxiliary_loss_clip": 0.01441676, + "auxiliary_loss_mlp": 0.01251225, + "balance_loss_clip": 1.12151194, + "balance_loss_mlp": 1.02653956, + "epoch": 0.42008116639110177, + "flos": 26798181289920.0, + "grad_norm": 1.5022366536355691, + "language_loss": 0.72440767, + "learning_rate": 2.605129974111655e-06, + "loss": 0.75133669, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.823646068572998 + }, + { + "auxiliary_loss_clip": 0.01456157, + "auxiliary_loss_mlp": 0.01248281, + "balance_loss_clip": 1.13553739, + "balance_loss_mlp": 1.01749194, + "epoch": 0.42014128964376973, + "flos": 32090046855840.0, + "grad_norm": 1.4283349873767286, + "language_loss": 0.74876869, + "learning_rate": 2.604758755512104e-06, + "loss": 0.7758131, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 2.8303632736206055 + }, + { + "auxiliary_loss_clip": 0.01454837, + "auxiliary_loss_mlp": 0.01267688, + "balance_loss_clip": 1.13376224, + "balance_loss_mlp": 1.03880572, + "epoch": 0.4202014128964377, + "flos": 26469454648320.0, + "grad_norm": 1.5180537385466084, + "language_loss": 0.74332929, + "learning_rate": 2.60438751398004e-06, + "loss": 0.77055454, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 2.874399185180664 + }, + { + "auxiliary_loss_clip": 0.01452944, + "auxiliary_loss_mlp": 0.01259557, + "balance_loss_clip": 1.13308251, + "balance_loss_mlp": 1.03067541, + "epoch": 0.42026153614910566, + "flos": 13402580035680.0, + "grad_norm": 2.114177424656759, + "language_loss": 0.71628755, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.74341255, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 4.287685871124268 + }, + { + "auxiliary_loss_clip": 0.01582121, + "auxiliary_loss_mlp": 0.01224152, + "balance_loss_clip": 1.28665829, + "balance_loss_mlp": 1.0151062, + "epoch": 0.42032165940177363, + "flos": 60256575853920.0, + "grad_norm": 0.8285524501198356, + "language_loss": 0.60470271, + "learning_rate": 2.603644962174685e-06, + "loss": 0.63276541, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 4.8203771114349365 + }, + { + "auxiliary_loss_clip": 0.01458348, + "auxiliary_loss_mlp": 0.01265635, + "balance_loss_clip": 1.13910103, + "balance_loss_mlp": 1.03560877, + "epoch": 0.4203817826544416, + "flos": 24537526581600.0, + "grad_norm": 7.1862662302053035, + "language_loss": 0.83114111, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85838097, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 2.9264461994171143 + }, + { + "auxiliary_loss_clip": 0.0158111, + "auxiliary_loss_mlp": 0.0122831, + "balance_loss_clip": 1.28614879, + "balance_loss_mlp": 1.01850128, + "epoch": 0.42044190590710956, + "flos": 58826442444480.0, + "grad_norm": 0.8412826597175135, + "language_loss": 0.65487891, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.68297309, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 3.310070514678955 + }, + { + "auxiliary_loss_clip": 0.01449982, + "auxiliary_loss_mlp": 0.01256609, + "balance_loss_clip": 1.12987781, + "balance_loss_mlp": 1.02486575, + "epoch": 0.4205020291597775, + "flos": 16437962990880.0, + "grad_norm": 1.8677885965142451, + "language_loss": 0.83286703, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85993296, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.9319632053375244 + }, + { + "auxiliary_loss_clip": 0.01450669, + "auxiliary_loss_mlp": 0.01251062, + "balance_loss_clip": 1.13217139, + "balance_loss_mlp": 1.02561307, + "epoch": 0.4205621524124455, + "flos": 18407629941120.0, + "grad_norm": 1.6791711427516283, + "language_loss": 0.78375751, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.8107748, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 2.814236640930176 + }, + { + "auxiliary_loss_clip": 0.01448369, + "auxiliary_loss_mlp": 0.01252531, + "balance_loss_clip": 1.1294713, + "balance_loss_mlp": 1.02612877, + "epoch": 0.4206222756651135, + "flos": 25522454877120.0, + "grad_norm": 1.5010577899150632, + "language_loss": 0.80021811, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82722706, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 2.8634684085845947 + }, + { + "auxiliary_loss_clip": 0.01449405, + "auxiliary_loss_mlp": 0.01262004, + "balance_loss_clip": 1.12956095, + "balance_loss_mlp": 1.03407633, + "epoch": 0.4206823989177815, + "flos": 15306137474400.0, + "grad_norm": 2.171952429448434, + "language_loss": 0.75724173, + "learning_rate": 2.601416757842559e-06, + "loss": 0.78435576, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 4.288829565048218 + }, + { + "auxiliary_loss_clip": 0.01438603, + "auxiliary_loss_mlp": 0.0125313, + "balance_loss_clip": 1.11894476, + "balance_loss_mlp": 1.02691841, + "epoch": 0.42074252217044944, + "flos": 15555593532960.0, + "grad_norm": 2.0688016137152436, + "language_loss": 0.75844246, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78535986, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 2.736795425415039 + }, + { + "auxiliary_loss_clip": 0.01449644, + "auxiliary_loss_mlp": 0.01266159, + "balance_loss_clip": 1.129776, + "balance_loss_mlp": 1.03765917, + "epoch": 0.4208026454231174, + "flos": 26148579135840.0, + "grad_norm": 1.8334396148792054, + "language_loss": 0.7620433, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78920138, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 2.8748254776000977 + }, + { + "auxiliary_loss_clip": 0.01451941, + "auxiliary_loss_mlp": 0.01266512, + "balance_loss_clip": 1.1326077, + "balance_loss_mlp": 1.03972769, + "epoch": 0.42086276867578537, + "flos": 23552370717120.0, + "grad_norm": 2.28293397846391, + "language_loss": 0.64066279, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66784739, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 2.7373831272125244 + }, + { + "auxiliary_loss_clip": 0.01456273, + "auxiliary_loss_mlp": 0.01269354, + "balance_loss_clip": 1.13851714, + "balance_loss_mlp": 1.03799248, + "epoch": 0.42092289192845334, + "flos": 18115352625600.0, + "grad_norm": 1.528964227206569, + "language_loss": 0.76369262, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.79094887, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.8280720710754395 + }, + { + "auxiliary_loss_clip": 0.01458795, + "auxiliary_loss_mlp": 0.0126394, + "balance_loss_clip": 1.13981938, + "balance_loss_mlp": 1.03944468, + "epoch": 0.4209830151811213, + "flos": 20008403964000.0, + "grad_norm": 1.5027691204071612, + "language_loss": 0.86545575, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.89268315, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.811997413635254 + }, + { + "auxiliary_loss_clip": 0.01453747, + "auxiliary_loss_mlp": 0.01271944, + "balance_loss_clip": 1.13506281, + "balance_loss_mlp": 1.04649544, + "epoch": 0.42104313843378927, + "flos": 21980991382560.0, + "grad_norm": 2.8032967109680436, + "language_loss": 0.67326248, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.70051938, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 2.8094940185546875 + }, + { + "auxiliary_loss_clip": 0.01454586, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_clip": 1.13481188, + "balance_loss_mlp": 1.03636122, + "epoch": 0.42110326168645723, + "flos": 25446142690560.0, + "grad_norm": 2.2318956905630336, + "language_loss": 0.77663052, + "learning_rate": 2.598816148672344e-06, + "loss": 0.80382311, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 2.956735610961914 + }, + { + "auxiliary_loss_clip": 0.01451569, + "auxiliary_loss_mlp": 0.01258123, + "balance_loss_clip": 1.13421559, + "balance_loss_mlp": 1.03095746, + "epoch": 0.4211633849391252, + "flos": 17824554508320.0, + "grad_norm": 1.9809882165669797, + "language_loss": 0.68236572, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70946264, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.9675559997558594 + }, + { + "auxiliary_loss_clip": 0.01451327, + "auxiliary_loss_mlp": 0.01262846, + "balance_loss_clip": 1.13238418, + "balance_loss_mlp": 1.03491747, + "epoch": 0.42122350819179316, + "flos": 16283897347680.0, + "grad_norm": 2.1182642424674016, + "language_loss": 0.72661722, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.75375891, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.932792901992798 + }, + { + "auxiliary_loss_clip": 0.01445564, + "auxiliary_loss_mlp": 0.01262482, + "balance_loss_clip": 1.12585163, + "balance_loss_mlp": 1.03283763, + "epoch": 0.4212836314444611, + "flos": 19647893160000.0, + "grad_norm": 1.6256721720016845, + "language_loss": 0.70850635, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.73558676, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.816516876220703 + }, + { + "auxiliary_loss_clip": 0.01448136, + "auxiliary_loss_mlp": 0.01265012, + "balance_loss_clip": 1.12845337, + "balance_loss_mlp": 1.03536725, + "epoch": 0.4213437546971291, + "flos": 18370990974240.0, + "grad_norm": 1.8260957541765288, + "language_loss": 0.82226336, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84939492, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.879105806350708 + }, + { + "auxiliary_loss_clip": 0.01445765, + "auxiliary_loss_mlp": 0.01254526, + "balance_loss_clip": 1.12610137, + "balance_loss_mlp": 1.02659798, + "epoch": 0.42140387794979706, + "flos": 27706721542560.0, + "grad_norm": 1.7509944115419291, + "language_loss": 0.7193898, + "learning_rate": 2.596957889196831e-06, + "loss": 0.74639273, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.8412516117095947 + }, + { + "auxiliary_loss_clip": 0.01435918, + "auxiliary_loss_mlp": 0.0125641, + "balance_loss_clip": 1.11724424, + "balance_loss_mlp": 1.0294354, + "epoch": 0.4214640012024651, + "flos": 28149575110560.0, + "grad_norm": 1.8935205338598344, + "language_loss": 0.66660976, + "learning_rate": 2.596586169335243e-06, + "loss": 0.69353306, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.821197509765625 + }, + { + "auxiliary_loss_clip": 0.01440666, + "auxiliary_loss_mlp": 0.01262867, + "balance_loss_clip": 1.12059963, + "balance_loss_mlp": 1.03513002, + "epoch": 0.42152412445513304, + "flos": 22999183038720.0, + "grad_norm": 1.577921988796279, + "language_loss": 0.72557712, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.75261247, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.7835395336151123 + }, + { + "auxiliary_loss_clip": 0.0154975, + "auxiliary_loss_mlp": 0.01226959, + "balance_loss_clip": 1.25182927, + "balance_loss_mlp": 1.01638794, + "epoch": 0.421584247707801, + "flos": 63755711157600.0, + "grad_norm": 0.799168523472131, + "language_loss": 0.54324466, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.57101178, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 3.205435276031494 + }, + { + "auxiliary_loss_clip": 0.01448665, + "auxiliary_loss_mlp": 0.01258184, + "balance_loss_clip": 1.12924051, + "balance_loss_mlp": 1.02873003, + "epoch": 0.421644370960469, + "flos": 24316820432640.0, + "grad_norm": 1.41573914238773, + "language_loss": 0.7869482, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.8140167, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.837686538696289 + }, + { + "auxiliary_loss_clip": 0.01439631, + "auxiliary_loss_mlp": 0.01265096, + "balance_loss_clip": 1.12008166, + "balance_loss_mlp": 1.0383122, + "epoch": 0.42170449421313694, + "flos": 23442946882560.0, + "grad_norm": 2.311828474683727, + "language_loss": 0.81320512, + "learning_rate": 2.595099063803787e-06, + "loss": 0.8402524, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.7866125106811523 + }, + { + "auxiliary_loss_clip": 0.0144826, + "auxiliary_loss_mlp": 0.01260585, + "balance_loss_clip": 1.13040853, + "balance_loss_mlp": 1.02998662, + "epoch": 0.4217646174658049, + "flos": 23697712883520.0, + "grad_norm": 1.6106458925873488, + "language_loss": 0.77488136, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.80196983, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.841285228729248 + }, + { + "auxiliary_loss_clip": 0.0145006, + "auxiliary_loss_mlp": 0.01266612, + "balance_loss_clip": 1.13127518, + "balance_loss_mlp": 1.03696716, + "epoch": 0.42182474071847287, + "flos": 24973818577920.0, + "grad_norm": 1.4396657840132705, + "language_loss": 0.82106531, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84823203, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.7927780151367188 + }, + { + "auxiliary_loss_clip": 0.0144355, + "auxiliary_loss_mlp": 0.01251259, + "balance_loss_clip": 1.1255548, + "balance_loss_mlp": 1.02065992, + "epoch": 0.42188486397114083, + "flos": 22858847389440.0, + "grad_norm": 2.01871328439679, + "language_loss": 0.68487155, + "learning_rate": 2.593983497660586e-06, + "loss": 0.71181965, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.8052151203155518 + }, + { + "auxiliary_loss_clip": 0.01538383, + "auxiliary_loss_mlp": 0.01221397, + "balance_loss_clip": 1.24058568, + "balance_loss_mlp": 1.01158905, + "epoch": 0.4219449872238088, + "flos": 66982860711360.0, + "grad_norm": 0.7008415076042516, + "language_loss": 0.59327972, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.6208775, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.431640386581421 + }, + { + "auxiliary_loss_clip": 0.0144296, + "auxiliary_loss_mlp": 0.01263725, + "balance_loss_clip": 1.1247952, + "balance_loss_mlp": 1.03083801, + "epoch": 0.42200511047647676, + "flos": 13117053932640.0, + "grad_norm": 2.2520431465859025, + "language_loss": 0.75287628, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77994311, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 4.358793497085571 + }, + { + "auxiliary_loss_clip": 0.0144732, + "auxiliary_loss_mlp": 0.01262436, + "balance_loss_clip": 1.12947345, + "balance_loss_mlp": 1.03355443, + "epoch": 0.42206523372914473, + "flos": 13992937675200.0, + "grad_norm": 1.9877891005923163, + "language_loss": 0.69076204, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71785963, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.8247766494750977 + }, + { + "auxiliary_loss_clip": 0.01444203, + "auxiliary_loss_mlp": 0.012479, + "balance_loss_clip": 1.12502491, + "balance_loss_mlp": 1.02283287, + "epoch": 0.4221253569818127, + "flos": 21944352415680.0, + "grad_norm": 1.6612233420645628, + "language_loss": 0.81075442, + "learning_rate": 2.592495760867347e-06, + "loss": 0.83767545, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.812427043914795 + }, + { + "auxiliary_loss_clip": 0.01446454, + "auxiliary_loss_mlp": 0.01261974, + "balance_loss_clip": 1.12819898, + "balance_loss_mlp": 1.03461838, + "epoch": 0.42218548023448066, + "flos": 32195071023840.0, + "grad_norm": 1.9665384562379726, + "language_loss": 0.69597268, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72305703, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.87165904045105 + }, + { + "auxiliary_loss_clip": 0.01440612, + "auxiliary_loss_mlp": 0.01247403, + "balance_loss_clip": 1.12257206, + "balance_loss_mlp": 1.02405286, + "epoch": 0.4222456034871487, + "flos": 30121669463040.0, + "grad_norm": 1.5591885746189953, + "language_loss": 0.67308581, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69996595, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 3.0179619789123535 + }, + { + "auxiliary_loss_clip": 0.01451673, + "auxiliary_loss_mlp": 0.01262528, + "balance_loss_clip": 1.13371646, + "balance_loss_mlp": 1.03803337, + "epoch": 0.42230572673981664, + "flos": 22130126364960.0, + "grad_norm": 1.595316613072347, + "language_loss": 0.69615555, + "learning_rate": 2.591379722314322e-06, + "loss": 0.72329754, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 2.894430637359619 + }, + { + "auxiliary_loss_clip": 0.01446968, + "auxiliary_loss_mlp": 0.01257918, + "balance_loss_clip": 1.12720299, + "balance_loss_mlp": 1.03056216, + "epoch": 0.4223658499924846, + "flos": 22057303569120.0, + "grad_norm": 1.6280407852035088, + "language_loss": 0.76844543, + "learning_rate": 2.591007664594147e-06, + "loss": 0.79549432, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 2.908358097076416 + }, + { + "auxiliary_loss_clip": 0.01448601, + "auxiliary_loss_mlp": 0.01253935, + "balance_loss_clip": 1.1308403, + "balance_loss_mlp": 1.02867699, + "epoch": 0.4224259732451526, + "flos": 20412722157120.0, + "grad_norm": 2.293687145585564, + "language_loss": 0.79760909, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.82463443, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.8069190979003906 + }, + { + "auxiliary_loss_clip": 0.0154606, + "auxiliary_loss_mlp": 0.01220955, + "balance_loss_clip": 1.24615741, + "balance_loss_mlp": 1.01114655, + "epoch": 0.42248609649782054, + "flos": 62853239410560.0, + "grad_norm": 0.7164892721815093, + "language_loss": 0.61827934, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.64594948, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.408735990524292 + }, + { + "auxiliary_loss_clip": 0.01447976, + "auxiliary_loss_mlp": 0.01260989, + "balance_loss_clip": 1.12902617, + "balance_loss_mlp": 1.03687513, + "epoch": 0.4225462197504885, + "flos": 26252351674560.0, + "grad_norm": 2.233944467988489, + "language_loss": 0.71051478, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.73760438, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 4.284631013870239 + }, + { + "auxiliary_loss_clip": 0.01446858, + "auxiliary_loss_mlp": 0.0126719, + "balance_loss_clip": 1.12852049, + "balance_loss_mlp": 1.04155123, + "epoch": 0.42260634300315647, + "flos": 20524042399680.0, + "grad_norm": 1.8977277081977144, + "language_loss": 0.82496375, + "learning_rate": 2.589519209743846e-06, + "loss": 0.85210431, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.752699136734009 + }, + { + "auxiliary_loss_clip": 0.01453287, + "auxiliary_loss_mlp": 0.01268478, + "balance_loss_clip": 1.13437152, + "balance_loss_mlp": 1.04016876, + "epoch": 0.42266646625582444, + "flos": 24319058194080.0, + "grad_norm": 2.6344528346956637, + "language_loss": 0.74577379, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77299142, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 5.297173261642456 + }, + { + "auxiliary_loss_clip": 0.01449092, + "auxiliary_loss_mlp": 0.01273465, + "balance_loss_clip": 1.12974811, + "balance_loss_mlp": 1.04763532, + "epoch": 0.4227265895084924, + "flos": 24206296681440.0, + "grad_norm": 2.913953526194681, + "language_loss": 0.87083399, + "learning_rate": 2.588774848134486e-06, + "loss": 0.89805961, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 2.7833662033081055 + }, + { + "auxiliary_loss_clip": 0.01450414, + "auxiliary_loss_mlp": 0.01274856, + "balance_loss_clip": 1.13150167, + "balance_loss_mlp": 1.04807246, + "epoch": 0.42278671276116037, + "flos": 16911652517280.0, + "grad_norm": 2.0010200815152888, + "language_loss": 0.73409116, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.76134384, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 2.7599246501922607 + }, + { + "auxiliary_loss_clip": 0.01443213, + "auxiliary_loss_mlp": 0.0126788, + "balance_loss_clip": 1.12477756, + "balance_loss_mlp": 1.043576, + "epoch": 0.42284683601382833, + "flos": 25413448252320.0, + "grad_norm": 1.5071221543318385, + "language_loss": 0.70822716, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.73533809, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 2.806072235107422 + }, + { + "auxiliary_loss_clip": 0.01447602, + "auxiliary_loss_mlp": 0.01261384, + "balance_loss_clip": 1.12844741, + "balance_loss_mlp": 1.03555441, + "epoch": 0.4229069592664963, + "flos": 23042611146240.0, + "grad_norm": 1.7682366351366037, + "language_loss": 0.9048031, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.93189299, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 2.8142476081848145 + }, + { + "auxiliary_loss_clip": 0.01442647, + "auxiliary_loss_mlp": 0.01262618, + "balance_loss_clip": 1.12401414, + "balance_loss_mlp": 1.03468966, + "epoch": 0.42296708251916426, + "flos": 26069839547040.0, + "grad_norm": 1.851179719214286, + "language_loss": 0.77217472, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79922736, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 4.240528345108032 + }, + { + "auxiliary_loss_clip": 0.0144246, + "auxiliary_loss_mlp": 0.0127167, + "balance_loss_clip": 1.12487519, + "balance_loss_mlp": 1.04297864, + "epoch": 0.4230272057718323, + "flos": 19460109018240.0, + "grad_norm": 1.8500728620322757, + "language_loss": 0.82822269, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.85536397, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 2.7282328605651855 + }, + { + "auxiliary_loss_clip": 0.01448309, + "auxiliary_loss_mlp": 0.01265617, + "balance_loss_clip": 1.12982953, + "balance_loss_mlp": 1.03978729, + "epoch": 0.42308732902450025, + "flos": 22385537144640.0, + "grad_norm": 1.7383863436122482, + "language_loss": 0.70306987, + "learning_rate": 2.58654122792447e-06, + "loss": 0.73020911, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 2.8153066635131836 + }, + { + "auxiliary_loss_clip": 0.01440601, + "auxiliary_loss_mlp": 0.0124939, + "balance_loss_clip": 1.12188506, + "balance_loss_mlp": 1.02012682, + "epoch": 0.4231474522771682, + "flos": 20997504357120.0, + "grad_norm": 1.7097189394560754, + "language_loss": 0.7824201, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80931997, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 2.8432681560516357 + }, + { + "auxiliary_loss_clip": 0.01453283, + "auxiliary_loss_mlp": 0.01277903, + "balance_loss_clip": 1.134269, + "balance_loss_mlp": 1.04444385, + "epoch": 0.4232075755298362, + "flos": 14977638401760.0, + "grad_norm": 2.282216130699784, + "language_loss": 0.6635865, + "learning_rate": 2.585796509770259e-06, + "loss": 0.6908983, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 2.755775213241577 + }, + { + "auxiliary_loss_clip": 0.01446971, + "auxiliary_loss_mlp": 0.01259735, + "balance_loss_clip": 1.12857854, + "balance_loss_mlp": 1.02570343, + "epoch": 0.42326769878250414, + "flos": 24534833682240.0, + "grad_norm": 1.6216308813955196, + "language_loss": 0.75698817, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78405523, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.8774824142456055 + }, + { + "auxiliary_loss_clip": 0.01449031, + "auxiliary_loss_mlp": 0.01256001, + "balance_loss_clip": 1.12941194, + "balance_loss_mlp": 1.02940798, + "epoch": 0.4233278220351721, + "flos": 26872748781120.0, + "grad_norm": 1.863840877934121, + "language_loss": 0.64957112, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67662144, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 2.8377716541290283 + }, + { + "auxiliary_loss_clip": 0.01449381, + "auxiliary_loss_mlp": 0.0126304, + "balance_loss_clip": 1.13019884, + "balance_loss_mlp": 1.03377652, + "epoch": 0.4233879452878401, + "flos": 42818285734560.0, + "grad_norm": 2.439150325007659, + "language_loss": 0.738702, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76582623, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 3.080523729324341 + }, + { + "auxiliary_loss_clip": 0.01443713, + "auxiliary_loss_mlp": 0.01262492, + "balance_loss_clip": 1.12600899, + "balance_loss_mlp": 1.04085851, + "epoch": 0.42344806854050804, + "flos": 25231391262720.0, + "grad_norm": 1.4296863900103411, + "language_loss": 0.82297879, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.85004085, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.869089126586914 + }, + { + "auxiliary_loss_clip": 0.01462715, + "auxiliary_loss_mlp": 0.01268813, + "balance_loss_clip": 1.14463782, + "balance_loss_mlp": 1.03783262, + "epoch": 0.423508191793176, + "flos": 22780752579360.0, + "grad_norm": 3.976090716118705, + "language_loss": 0.65196419, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67927951, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.8786847591400146 + }, + { + "auxiliary_loss_clip": 0.01464939, + "auxiliary_loss_mlp": 0.01264899, + "balance_loss_clip": 1.14701486, + "balance_loss_mlp": 1.03487253, + "epoch": 0.42356831504584397, + "flos": 34640134267680.0, + "grad_norm": 1.7578178330983127, + "language_loss": 0.75307345, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.78037179, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.9124820232391357 + }, + { + "auxiliary_loss_clip": 0.01457828, + "auxiliary_loss_mlp": 0.01252347, + "balance_loss_clip": 1.14151239, + "balance_loss_mlp": 1.02556348, + "epoch": 0.42362843829851193, + "flos": 17598348776160.0, + "grad_norm": 4.390152268341934, + "language_loss": 0.80667591, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.83377767, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.841541051864624 + }, + { + "auxiliary_loss_clip": 0.01452089, + "auxiliary_loss_mlp": 0.01258804, + "balance_loss_clip": 1.13425207, + "balance_loss_mlp": 1.0301125, + "epoch": 0.4236885615511799, + "flos": 22567935487680.0, + "grad_norm": 1.7868383343521375, + "language_loss": 0.77000791, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.79711682, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 2.8430607318878174 + }, + { + "auxiliary_loss_clip": 0.01459856, + "auxiliary_loss_mlp": 0.01259203, + "balance_loss_clip": 1.14394665, + "balance_loss_mlp": 1.03108454, + "epoch": 0.42374868480384786, + "flos": 26471654481600.0, + "grad_norm": 1.688520407974145, + "language_loss": 0.67801368, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70520425, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.889343023300171 + }, + { + "auxiliary_loss_clip": 0.01458307, + "auxiliary_loss_mlp": 0.01259604, + "balance_loss_clip": 1.14088964, + "balance_loss_mlp": 1.03148508, + "epoch": 0.4238088080565159, + "flos": 20371986948960.0, + "grad_norm": 1.6539686637635955, + "language_loss": 0.78043211, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80761123, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.8380026817321777 + }, + { + "auxiliary_loss_clip": 0.01454267, + "auxiliary_loss_mlp": 0.01257319, + "balance_loss_clip": 1.13635302, + "balance_loss_mlp": 1.02786446, + "epoch": 0.42386893130918385, + "flos": 21173872122720.0, + "grad_norm": 2.0400105886194453, + "language_loss": 0.82255405, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84966993, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.9016613960266113 + }, + { + "auxiliary_loss_clip": 0.01445209, + "auxiliary_loss_mlp": 0.01261351, + "balance_loss_clip": 1.12806046, + "balance_loss_mlp": 1.03189707, + "epoch": 0.4239290545618518, + "flos": 17677543502880.0, + "grad_norm": 2.1499978909435007, + "language_loss": 0.73551059, + "learning_rate": 2.581326338868687e-06, + "loss": 0.76257622, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.8081486225128174 + }, + { + "auxiliary_loss_clip": 0.01457018, + "auxiliary_loss_mlp": 0.01252051, + "balance_loss_clip": 1.1408999, + "balance_loss_mlp": 1.02603018, + "epoch": 0.4239891778145198, + "flos": 24316706648160.0, + "grad_norm": 1.4212043494240438, + "language_loss": 0.86529118, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.89238191, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.91979718208313 + }, + { + "auxiliary_loss_clip": 0.01455781, + "auxiliary_loss_mlp": 0.01256205, + "balance_loss_clip": 1.13756549, + "balance_loss_mlp": 1.02446175, + "epoch": 0.42404930106718774, + "flos": 20560529653920.0, + "grad_norm": 1.4091450587891785, + "language_loss": 0.72226572, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.7493856, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.875765800476074 + }, + { + "auxiliary_loss_clip": 0.01449334, + "auxiliary_loss_mlp": 0.01247231, + "balance_loss_clip": 1.13120329, + "balance_loss_mlp": 1.01968384, + "epoch": 0.4241094243198557, + "flos": 22310173162080.0, + "grad_norm": 2.0473450080953346, + "language_loss": 0.82426679, + "learning_rate": 2.580208299200704e-06, + "loss": 0.85123247, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.957667112350464 + }, + { + "auxiliary_loss_clip": 0.01565587, + "auxiliary_loss_mlp": 0.01228745, + "balance_loss_clip": 1.26448035, + "balance_loss_mlp": 1.02122498, + "epoch": 0.4241695475725237, + "flos": 70619145698880.0, + "grad_norm": 0.8188008890193661, + "language_loss": 0.60388553, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.63182878, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.3838775157928467 + }, + { + "auxiliary_loss_clip": 0.01457425, + "auxiliary_loss_mlp": 0.01266065, + "balance_loss_clip": 1.1398927, + "balance_loss_mlp": 1.03775561, + "epoch": 0.42422967082519164, + "flos": 14029197360480.0, + "grad_norm": 2.8056538232601835, + "language_loss": 0.76622415, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79345906, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.8467819690704346 + }, + { + "auxiliary_loss_clip": 0.01463731, + "auxiliary_loss_mlp": 0.01255776, + "balance_loss_clip": 1.145648, + "balance_loss_mlp": 1.02079093, + "epoch": 0.4242897940778596, + "flos": 22347684476640.0, + "grad_norm": 3.354335715222499, + "language_loss": 0.84616435, + "learning_rate": 2.579090061518714e-06, + "loss": 0.87335938, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.917539596557617 + }, + { + "auxiliary_loss_clip": 0.01459139, + "auxiliary_loss_mlp": 0.0125768, + "balance_loss_clip": 1.14121938, + "balance_loss_mlp": 1.02860761, + "epoch": 0.42434991733052757, + "flos": 22597747385760.0, + "grad_norm": 2.297183152995458, + "language_loss": 0.82818568, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85535383, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 4.4187211990356445 + }, + { + "auxiliary_loss_clip": 0.01462477, + "auxiliary_loss_mlp": 0.01257524, + "balance_loss_clip": 1.14491522, + "balance_loss_mlp": 1.03321958, + "epoch": 0.42441004058319554, + "flos": 20013524265600.0, + "grad_norm": 1.7343552144504608, + "language_loss": 0.80143827, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82863832, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 2.878955841064453 + }, + { + "auxiliary_loss_clip": 0.01455648, + "auxiliary_loss_mlp": 0.01257499, + "balance_loss_clip": 1.13756526, + "balance_loss_mlp": 1.027282, + "epoch": 0.4244701638358635, + "flos": 11146552562880.0, + "grad_norm": 2.2219287289252265, + "language_loss": 0.70353729, + "learning_rate": 2.57797162620435e-06, + "loss": 0.73066872, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.836803436279297 + }, + { + "auxiliary_loss_clip": 0.01458323, + "auxiliary_loss_mlp": 0.01255957, + "balance_loss_clip": 1.14076233, + "balance_loss_mlp": 1.02631235, + "epoch": 0.42453028708853147, + "flos": 23990028127200.0, + "grad_norm": 1.674443673424473, + "language_loss": 0.76065409, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78779685, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.887908458709717 + }, + { + "auxiliary_loss_clip": 0.01459388, + "auxiliary_loss_mlp": 0.01255552, + "balance_loss_clip": 1.14100623, + "balance_loss_mlp": 1.02609777, + "epoch": 0.42459041034119943, + "flos": 18408350576160.0, + "grad_norm": 2.0386553105376946, + "language_loss": 0.72873008, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75587952, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 3.0040764808654785 + }, + { + "auxiliary_loss_clip": 0.01448794, + "auxiliary_loss_mlp": 0.01257147, + "balance_loss_clip": 1.13146472, + "balance_loss_mlp": 1.02731097, + "epoch": 0.42465053359386745, + "flos": 20960030970720.0, + "grad_norm": 1.8836777663011208, + "language_loss": 0.66305757, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.69011688, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.8627936840057373 + }, + { + "auxiliary_loss_clip": 0.01454031, + "auxiliary_loss_mlp": 0.01255574, + "balance_loss_clip": 1.13615239, + "balance_loss_mlp": 1.03126979, + "epoch": 0.4247106568465354, + "flos": 33108807434400.0, + "grad_norm": 1.787498097572192, + "language_loss": 0.78885972, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.81595576, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 2.9791619777679443 + }, + { + "auxiliary_loss_clip": 0.01453679, + "auxiliary_loss_mlp": 0.01255, + "balance_loss_clip": 1.13569784, + "balance_loss_mlp": 1.02592731, + "epoch": 0.4247707800992034, + "flos": 20048949531360.0, + "grad_norm": 2.1740643978900867, + "language_loss": 0.75202596, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77911282, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.8878467082977295 + }, + { + "auxiliary_loss_clip": 0.01459698, + "auxiliary_loss_mlp": 0.01259527, + "balance_loss_clip": 1.14235544, + "balance_loss_mlp": 1.03426933, + "epoch": 0.42483090335187135, + "flos": 22387471480800.0, + "grad_norm": 1.6694983357743978, + "language_loss": 0.72406781, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.75126004, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 4.296485424041748 + }, + { + "auxiliary_loss_clip": 0.01452085, + "auxiliary_loss_mlp": 0.01261082, + "balance_loss_clip": 1.13418853, + "balance_loss_mlp": 1.03105581, + "epoch": 0.4248910266045393, + "flos": 21358735796160.0, + "grad_norm": 2.525284980174437, + "language_loss": 0.79994327, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.82707494, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 2.9407777786254883 + }, + { + "auxiliary_loss_clip": 0.015541, + "auxiliary_loss_mlp": 0.01215538, + "balance_loss_clip": 1.25229025, + "balance_loss_mlp": 1.00649261, + "epoch": 0.4249511498572073, + "flos": 64014421687200.0, + "grad_norm": 0.9324581310370246, + "language_loss": 0.63466555, + "learning_rate": 2.574988168733022e-06, + "loss": 0.66236192, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 5.475610017776489 + }, + { + "auxiliary_loss_clip": 0.01451318, + "auxiliary_loss_mlp": 0.01271311, + "balance_loss_clip": 1.13264489, + "balance_loss_mlp": 1.0443368, + "epoch": 0.42501127310987524, + "flos": 19608940575360.0, + "grad_norm": 2.0661322107826607, + "language_loss": 0.72512329, + "learning_rate": 2.574615138284361e-06, + "loss": 0.75234956, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 2.978868246078491 + }, + { + "auxiliary_loss_clip": 0.0145502, + "auxiliary_loss_mlp": 0.01267194, + "balance_loss_clip": 1.13569951, + "balance_loss_mlp": 1.03526044, + "epoch": 0.4250713963625432, + "flos": 19464281115840.0, + "grad_norm": 5.050543600442902, + "language_loss": 0.7958529, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.82307506, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 2.772613048553467 + }, + { + "auxiliary_loss_clip": 0.01442691, + "auxiliary_loss_mlp": 0.01267385, + "balance_loss_clip": 1.1232779, + "balance_loss_mlp": 1.03812182, + "epoch": 0.4251315196152112, + "flos": 25340056534080.0, + "grad_norm": 2.1502573765449986, + "language_loss": 0.70684373, + "learning_rate": 2.573869012032795e-06, + "loss": 0.73394442, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 2.816221237182617 + }, + { + "auxiliary_loss_clip": 0.01445974, + "auxiliary_loss_mlp": 0.01282727, + "balance_loss_clip": 1.12666678, + "balance_loss_mlp": 1.05785072, + "epoch": 0.42519164286787914, + "flos": 26361585868320.0, + "grad_norm": 3.5608907998216566, + "language_loss": 0.70570219, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73298925, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 2.922168016433716 + }, + { + "auxiliary_loss_clip": 0.01451362, + "auxiliary_loss_mlp": 0.01271266, + "balance_loss_clip": 1.13298821, + "balance_loss_mlp": 1.04181218, + "epoch": 0.4252517661205471, + "flos": 26033466077280.0, + "grad_norm": 1.6946679462769003, + "language_loss": 0.81379938, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.84102571, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 4.427642107009888 + }, + { + "auxiliary_loss_clip": 0.0146285, + "auxiliary_loss_mlp": 0.01269398, + "balance_loss_clip": 1.14594388, + "balance_loss_mlp": 1.04356742, + "epoch": 0.42531188937321507, + "flos": 12715276926240.0, + "grad_norm": 2.3599847886197334, + "language_loss": 0.90586579, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.9331882, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 2.814411163330078 + }, + { + "auxiliary_loss_clip": 0.01459462, + "auxiliary_loss_mlp": 0.0125964, + "balance_loss_clip": 1.14124131, + "balance_loss_mlp": 1.02885056, + "epoch": 0.42537201262588303, + "flos": 22093904607840.0, + "grad_norm": 1.891186257338367, + "language_loss": 0.64623725, + "learning_rate": 2.572376498508805e-06, + "loss": 0.6734283, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 2.8121583461761475 + }, + { + "auxiliary_loss_clip": 0.01445972, + "auxiliary_loss_mlp": 0.01256785, + "balance_loss_clip": 1.13004839, + "balance_loss_mlp": 1.03171766, + "epoch": 0.42543213587855105, + "flos": 23005365328800.0, + "grad_norm": 1.6006363980467972, + "language_loss": 0.74652183, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.77354944, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 2.7659049034118652 + }, + { + "auxiliary_loss_clip": 0.01449656, + "auxiliary_loss_mlp": 0.01272655, + "balance_loss_clip": 1.132846, + "balance_loss_mlp": 1.04281926, + "epoch": 0.425492259131219, + "flos": 25084797467040.0, + "grad_norm": 4.101323841603271, + "language_loss": 0.790025, + "learning_rate": 2.571630111462766e-06, + "loss": 0.81724811, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.8297054767608643 + }, + { + "auxiliary_loss_clip": 0.01454957, + "auxiliary_loss_mlp": 0.0126575, + "balance_loss_clip": 1.13788438, + "balance_loss_mlp": 1.03782165, + "epoch": 0.425552382383887, + "flos": 22818870744480.0, + "grad_norm": 1.6828603170290777, + "language_loss": 0.73284185, + "learning_rate": 2.571256885418265e-06, + "loss": 0.76004899, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.7836451530456543 + }, + { + "auxiliary_loss_clip": 0.01465295, + "auxiliary_loss_mlp": 0.01262382, + "balance_loss_clip": 1.14908266, + "balance_loss_mlp": 1.03578877, + "epoch": 0.42561250563655495, + "flos": 13555318193280.0, + "grad_norm": 2.235411902247016, + "language_loss": 0.79830146, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.82557827, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 2.7212183475494385 + }, + { + "auxiliary_loss_clip": 0.01452719, + "auxiliary_loss_mlp": 0.01252434, + "balance_loss_clip": 1.13554692, + "balance_loss_mlp": 1.02641296, + "epoch": 0.4256726288892229, + "flos": 46982118600000.0, + "grad_norm": 1.3598085473205417, + "language_loss": 0.72076797, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74781954, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 3.071270227432251 + }, + { + "auxiliary_loss_clip": 0.01448267, + "auxiliary_loss_mlp": 0.01250964, + "balance_loss_clip": 1.13064873, + "balance_loss_mlp": 1.0239892, + "epoch": 0.4257327521418909, + "flos": 23588706258720.0, + "grad_norm": 3.4041506421876147, + "language_loss": 0.80559027, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.83258259, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 2.8087263107299805 + }, + { + "auxiliary_loss_clip": 0.01457938, + "auxiliary_loss_mlp": 0.01260745, + "balance_loss_clip": 1.13994431, + "balance_loss_mlp": 1.03663111, + "epoch": 0.42579287539455885, + "flos": 18992032859520.0, + "grad_norm": 1.9532996213819191, + "language_loss": 0.81500471, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.84219158, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.746267318725586 + }, + { + "auxiliary_loss_clip": 0.01460913, + "auxiliary_loss_mlp": 0.01262495, + "balance_loss_clip": 1.14242625, + "balance_loss_mlp": 1.03475726, + "epoch": 0.4258529986472268, + "flos": 25194259229760.0, + "grad_norm": 2.38009515276477, + "language_loss": 0.70172095, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72895503, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.804879665374756 + }, + { + "auxiliary_loss_clip": 0.01551369, + "auxiliary_loss_mlp": 0.01229797, + "balance_loss_clip": 1.25123, + "balance_loss_mlp": 1.02227783, + "epoch": 0.4259131218998948, + "flos": 69975649978560.0, + "grad_norm": 0.8632953198286915, + "language_loss": 0.67003441, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69784606, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.427619218826294 + }, + { + "auxiliary_loss_clip": 0.01457461, + "auxiliary_loss_mlp": 0.01266081, + "balance_loss_clip": 1.13839841, + "balance_loss_mlp": 1.03967822, + "epoch": 0.42597324515256274, + "flos": 18006952851360.0, + "grad_norm": 2.1903157962444224, + "language_loss": 0.78525257, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.81248796, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 2.8010151386260986 + }, + { + "auxiliary_loss_clip": 0.01460931, + "auxiliary_loss_mlp": 0.01261684, + "balance_loss_clip": 1.14154339, + "balance_loss_mlp": 1.03280187, + "epoch": 0.4260333684052307, + "flos": 15160605667200.0, + "grad_norm": 2.448314849906981, + "language_loss": 0.76165152, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78887767, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.753356456756592 + }, + { + "auxiliary_loss_clip": 0.01454944, + "auxiliary_loss_mlp": 0.01268245, + "balance_loss_clip": 1.13736856, + "balance_loss_mlp": 1.03840947, + "epoch": 0.42609349165789867, + "flos": 14941113219360.0, + "grad_norm": 1.999066947236384, + "language_loss": 0.80590546, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.83313739, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.7763750553131104 + }, + { + "auxiliary_loss_clip": 0.01461348, + "auxiliary_loss_mlp": 0.01254956, + "balance_loss_clip": 1.14356232, + "balance_loss_mlp": 1.0241673, + "epoch": 0.42615361491056664, + "flos": 23734200137760.0, + "grad_norm": 1.8802542723373359, + "language_loss": 0.65858316, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68574619, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.8189902305603027 + }, + { + "auxiliary_loss_clip": 0.01460542, + "auxiliary_loss_mlp": 0.01261561, + "balance_loss_clip": 1.1416285, + "balance_loss_mlp": 1.0303905, + "epoch": 0.42621373816323466, + "flos": 24938772593760.0, + "grad_norm": 2.100396615911057, + "language_loss": 0.6888203, + "learning_rate": 2.56714997234313e-06, + "loss": 0.71604133, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.8206255435943604 + }, + { + "auxiliary_loss_clip": 0.01460331, + "auxiliary_loss_mlp": 0.01260377, + "balance_loss_clip": 1.14043665, + "balance_loss_mlp": 1.03168643, + "epoch": 0.4262738614159026, + "flos": 13554673414560.0, + "grad_norm": 2.8741109103037896, + "language_loss": 0.73624253, + "learning_rate": 2.566776487287525e-06, + "loss": 0.76344955, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.806180238723755 + }, + { + "auxiliary_loss_clip": 0.01453403, + "auxiliary_loss_mlp": 0.01254665, + "balance_loss_clip": 1.13471258, + "balance_loss_mlp": 1.02349472, + "epoch": 0.4263339846685706, + "flos": 29751183552960.0, + "grad_norm": 2.254834353022419, + "language_loss": 0.75016522, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77724588, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.8423259258270264 + }, + { + "auxiliary_loss_clip": 0.0145485, + "auxiliary_loss_mlp": 0.01251661, + "balance_loss_clip": 1.13676512, + "balance_loss_mlp": 1.02926421, + "epoch": 0.42639410792123855, + "flos": 16835947181280.0, + "grad_norm": 2.120743179916961, + "language_loss": 0.82795465, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.85501975, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.7471835613250732 + }, + { + "auxiliary_loss_clip": 0.0145933, + "auxiliary_loss_mlp": 0.01270778, + "balance_loss_clip": 1.13999879, + "balance_loss_mlp": 1.04227793, + "epoch": 0.4264542311739065, + "flos": 28765420837920.0, + "grad_norm": 1.597929807924363, + "language_loss": 0.73937267, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76667374, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.853189706802368 + }, + { + "auxiliary_loss_clip": 0.01459568, + "auxiliary_loss_mlp": 0.01252153, + "balance_loss_clip": 1.14069104, + "balance_loss_mlp": 1.02155423, + "epoch": 0.4265143544265745, + "flos": 24715714898880.0, + "grad_norm": 3.5335496128275117, + "language_loss": 0.70423228, + "learning_rate": 2.565282332284532e-06, + "loss": 0.73134947, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.7635498046875 + }, + { + "auxiliary_loss_clip": 0.01456524, + "auxiliary_loss_mlp": 0.01264868, + "balance_loss_clip": 1.13750219, + "balance_loss_mlp": 1.03960991, + "epoch": 0.42657447767924245, + "flos": 21867812660160.0, + "grad_norm": 1.6147677251411718, + "language_loss": 0.81897646, + "learning_rate": 2.564908739909464e-06, + "loss": 0.84619039, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 4.552060604095459 + }, + { + "auxiliary_loss_clip": 0.01454381, + "auxiliary_loss_mlp": 0.0125495, + "balance_loss_clip": 1.13575196, + "balance_loss_mlp": 1.0262593, + "epoch": 0.4266346009319104, + "flos": 21472369656480.0, + "grad_norm": 1.937948362190264, + "language_loss": 0.80560303, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.83269632, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.7582128047943115 + }, + { + "auxiliary_loss_clip": 0.01442858, + "auxiliary_loss_mlp": 0.01264186, + "balance_loss_clip": 1.12370002, + "balance_loss_mlp": 1.0341599, + "epoch": 0.4266947241845784, + "flos": 25521696313920.0, + "grad_norm": 2.5876562916136923, + "language_loss": 0.65647393, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.6835444, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.849203109741211 + }, + { + "auxiliary_loss_clip": 0.01445789, + "auxiliary_loss_mlp": 0.01259124, + "balance_loss_clip": 1.12650228, + "balance_loss_mlp": 1.03501058, + "epoch": 0.42675484743724634, + "flos": 26543604929760.0, + "grad_norm": 1.7355017772602572, + "language_loss": 0.74281192, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76986104, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 2.8530349731445312 + }, + { + "auxiliary_loss_clip": 0.01453861, + "auxiliary_loss_mlp": 0.01254497, + "balance_loss_clip": 1.13433385, + "balance_loss_mlp": 1.02809525, + "epoch": 0.4268149706899143, + "flos": 23114978804160.0, + "grad_norm": 3.24379448221398, + "language_loss": 0.75392628, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.78100985, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.7760672569274902 + }, + { + "auxiliary_loss_clip": 0.01458264, + "auxiliary_loss_mlp": 0.01265262, + "balance_loss_clip": 1.13702047, + "balance_loss_mlp": 1.03523564, + "epoch": 0.4268750939425823, + "flos": 22708498705920.0, + "grad_norm": 2.268998595129373, + "language_loss": 0.83519053, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.8624258, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.777892589569092 + }, + { + "auxiliary_loss_clip": 0.01442523, + "auxiliary_loss_mlp": 0.01254498, + "balance_loss_clip": 1.12083411, + "balance_loss_mlp": 1.02943087, + "epoch": 0.42693521719525024, + "flos": 25377264423360.0, + "grad_norm": 1.4233379005305955, + "language_loss": 0.8237803, + "learning_rate": 2.562666736305627e-06, + "loss": 0.85075045, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.8205504417419434 + }, + { + "auxiliary_loss_clip": 0.01452634, + "auxiliary_loss_mlp": 0.01264569, + "balance_loss_clip": 1.13219762, + "balance_loss_mlp": 1.03416181, + "epoch": 0.42699534044791826, + "flos": 18152674299360.0, + "grad_norm": 2.392975201081577, + "language_loss": 0.73278266, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.75995469, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 2.749809503555298 + }, + { + "auxiliary_loss_clip": 0.0145177, + "auxiliary_loss_mlp": 0.01253219, + "balance_loss_clip": 1.13165069, + "balance_loss_mlp": 1.02986872, + "epoch": 0.4270554637005862, + "flos": 13700091437280.0, + "grad_norm": 1.7610684949047282, + "language_loss": 0.82859921, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85564911, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.7841296195983887 + }, + { + "auxiliary_loss_clip": 0.01456058, + "auxiliary_loss_mlp": 0.01273363, + "balance_loss_clip": 1.13660204, + "balance_loss_mlp": 1.04314566, + "epoch": 0.4271155869532542, + "flos": 17495524441440.0, + "grad_norm": 2.000629536240067, + "language_loss": 0.73888803, + "learning_rate": 2.561545446271294e-06, + "loss": 0.76618224, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 4.2248382568359375 + }, + { + "auxiliary_loss_clip": 0.01455739, + "auxiliary_loss_mlp": 0.01255056, + "balance_loss_clip": 1.13563621, + "balance_loss_mlp": 1.03170514, + "epoch": 0.42717571020592215, + "flos": 32455033182720.0, + "grad_norm": 2.092934944753347, + "language_loss": 0.75080097, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77790892, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 2.8744096755981445 + }, + { + "auxiliary_loss_clip": 0.01460226, + "auxiliary_loss_mlp": 0.01258488, + "balance_loss_clip": 1.14201832, + "balance_loss_mlp": 1.02979743, + "epoch": 0.4272358334585901, + "flos": 16254919869120.0, + "grad_norm": 2.8833667140202657, + "language_loss": 0.77115494, + "learning_rate": 2.560797813088819e-06, + "loss": 0.79834211, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 4.578167676925659 + }, + { + "auxiliary_loss_clip": 0.01451981, + "auxiliary_loss_mlp": 0.01264835, + "balance_loss_clip": 1.13327408, + "balance_loss_mlp": 1.04339147, + "epoch": 0.4272959567112581, + "flos": 24202010799360.0, + "grad_norm": 1.8153054226590482, + "language_loss": 0.79742938, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82459748, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 2.824655771255493 + }, + { + "auxiliary_loss_clip": 0.01450759, + "auxiliary_loss_mlp": 0.01265647, + "balance_loss_clip": 1.13179827, + "balance_loss_mlp": 1.04172468, + "epoch": 0.42735607996392605, + "flos": 27965925138240.0, + "grad_norm": 7.763181577756942, + "language_loss": 0.6801132, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70727724, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 2.826540231704712 + }, + { + "auxiliary_loss_clip": 0.0145949, + "auxiliary_loss_mlp": 0.01265544, + "balance_loss_clip": 1.13935292, + "balance_loss_mlp": 1.04162121, + "epoch": 0.427416203216594, + "flos": 20297267745120.0, + "grad_norm": 1.7270190314978329, + "language_loss": 0.71516591, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.74241626, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 2.818480968475342 + }, + { + "auxiliary_loss_clip": 0.01457749, + "auxiliary_loss_mlp": 0.01258886, + "balance_loss_clip": 1.1380856, + "balance_loss_mlp": 1.0345819, + "epoch": 0.427476326469262, + "flos": 26946443924640.0, + "grad_norm": 2.2089233849885184, + "language_loss": 0.64686942, + "learning_rate": 2.559302291651174e-06, + "loss": 0.67403579, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 2.832353353500366 + }, + { + "auxiliary_loss_clip": 0.01463056, + "auxiliary_loss_mlp": 0.01272846, + "balance_loss_clip": 1.14416146, + "balance_loss_mlp": 1.04663432, + "epoch": 0.42753644972192995, + "flos": 25705232501760.0, + "grad_norm": 1.7029342369980733, + "language_loss": 0.76625228, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.79361135, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.840062141418457 + }, + { + "auxiliary_loss_clip": 0.01460067, + "auxiliary_loss_mlp": 0.01264702, + "balance_loss_clip": 1.1400305, + "balance_loss_mlp": 1.03849006, + "epoch": 0.4275965729745979, + "flos": 18769240661760.0, + "grad_norm": 1.9882900821561635, + "language_loss": 0.73170882, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75895655, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 4.2741615772247314 + }, + { + "auxiliary_loss_clip": 0.01453741, + "auxiliary_loss_mlp": 0.01258409, + "balance_loss_clip": 1.13460767, + "balance_loss_mlp": 1.03200626, + "epoch": 0.4276566962272659, + "flos": 23766439438080.0, + "grad_norm": 1.786731201904189, + "language_loss": 0.71451819, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.74163967, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 2.8237128257751465 + }, + { + "auxiliary_loss_clip": 0.01461664, + "auxiliary_loss_mlp": 0.01259196, + "balance_loss_clip": 1.14227104, + "balance_loss_mlp": 1.03241229, + "epoch": 0.42771681947993384, + "flos": 22494733410240.0, + "grad_norm": 1.8585556389504265, + "language_loss": 0.61956799, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64677662, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 2.841075897216797 + }, + { + "auxiliary_loss_clip": 0.01459436, + "auxiliary_loss_mlp": 0.01266568, + "balance_loss_clip": 1.13982558, + "balance_loss_mlp": 1.03787649, + "epoch": 0.42777694273260186, + "flos": 25046982727200.0, + "grad_norm": 1.829422038638748, + "language_loss": 0.64843279, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.6756928, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.7893495559692383 + }, + { + "auxiliary_loss_clip": 0.01454555, + "auxiliary_loss_mlp": 0.01251457, + "balance_loss_clip": 1.13592148, + "balance_loss_mlp": 1.02448273, + "epoch": 0.4278370659852698, + "flos": 18663533786880.0, + "grad_norm": 1.5465595749572147, + "language_loss": 0.73850584, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.76556599, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 2.883103370666504 + }, + { + "auxiliary_loss_clip": 0.01452191, + "auxiliary_loss_mlp": 0.01259565, + "balance_loss_clip": 1.13263535, + "balance_loss_mlp": 1.03526044, + "epoch": 0.4278971892379378, + "flos": 27310747544640.0, + "grad_norm": 1.9263666508484907, + "language_loss": 0.69248855, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71960604, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 2.8110780715942383 + }, + { + "auxiliary_loss_clip": 0.01461691, + "auxiliary_loss_mlp": 0.01261291, + "balance_loss_clip": 1.14216018, + "balance_loss_mlp": 1.03603292, + "epoch": 0.42795731249060576, + "flos": 12889937924640.0, + "grad_norm": 4.377819557930605, + "language_loss": 0.6926313, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.71986115, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 2.7585513591766357 + }, + { + "auxiliary_loss_clip": 0.01455468, + "auxiliary_loss_mlp": 0.01253964, + "balance_loss_clip": 1.13688314, + "balance_loss_mlp": 1.02813387, + "epoch": 0.4280174357432737, + "flos": 33404270715360.0, + "grad_norm": 2.0395295875153368, + "language_loss": 0.74696016, + "learning_rate": 2.55593612908444e-06, + "loss": 0.77405447, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 2.9006237983703613 + }, + { + "auxiliary_loss_clip": 0.01458729, + "auxiliary_loss_mlp": 0.01263236, + "balance_loss_clip": 1.14003038, + "balance_loss_mlp": 1.03740573, + "epoch": 0.4280775589959417, + "flos": 18261036145440.0, + "grad_norm": 1.8351039680979722, + "language_loss": 0.74589872, + "learning_rate": 2.555562005426573e-06, + "loss": 0.77311832, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.757769823074341 + }, + { + "auxiliary_loss_clip": 0.01461024, + "auxiliary_loss_mlp": 0.012506, + "balance_loss_clip": 1.14325595, + "balance_loss_mlp": 1.02515185, + "epoch": 0.42813768224860965, + "flos": 21473469573120.0, + "grad_norm": 1.6461707151050085, + "language_loss": 0.77024919, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.79736543, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 2.824206829071045 + }, + { + "auxiliary_loss_clip": 0.01452842, + "auxiliary_loss_mlp": 0.01269724, + "balance_loss_clip": 1.13458085, + "balance_loss_mlp": 1.04542017, + "epoch": 0.4281978055012776, + "flos": 15671085873120.0, + "grad_norm": 2.2643188619661743, + "language_loss": 0.86029744, + "learning_rate": 2.554813694924126e-06, + "loss": 0.88752311, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.749694347381592 + }, + { + "auxiliary_loss_clip": 0.01456181, + "auxiliary_loss_mlp": 0.01251676, + "balance_loss_clip": 1.13742852, + "balance_loss_mlp": 1.02451098, + "epoch": 0.4282579287539456, + "flos": 17713916972640.0, + "grad_norm": 1.8489137972653127, + "language_loss": 0.81622326, + "learning_rate": 2.554439508107921e-06, + "loss": 0.84330177, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.7434964179992676 + }, + { + "auxiliary_loss_clip": 0.01454129, + "auxiliary_loss_mlp": 0.01266991, + "balance_loss_clip": 1.13550806, + "balance_loss_mlp": 1.04135203, + "epoch": 0.42831805200661355, + "flos": 19283172330240.0, + "grad_norm": 1.6036214288996973, + "language_loss": 0.81092429, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83813554, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.7962260246276855 + }, + { + "auxiliary_loss_clip": 0.01455308, + "auxiliary_loss_mlp": 0.01263625, + "balance_loss_clip": 1.13706303, + "balance_loss_mlp": 1.03531528, + "epoch": 0.4283781752592815, + "flos": 19794714524640.0, + "grad_norm": 2.061219207593157, + "language_loss": 0.81083214, + "learning_rate": 2.553691071416498e-06, + "loss": 0.83802152, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.794398307800293 + }, + { + "auxiliary_loss_clip": 0.01454173, + "auxiliary_loss_mlp": 0.01256762, + "balance_loss_clip": 1.1367451, + "balance_loss_mlp": 1.0299778, + "epoch": 0.4284382985119495, + "flos": 16509913439040.0, + "grad_norm": 6.269921960960829, + "language_loss": 0.75061572, + "learning_rate": 2.553316821569659e-06, + "loss": 0.7777251, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.718599796295166 + }, + { + "auxiliary_loss_clip": 0.01450485, + "auxiliary_loss_mlp": 0.01250999, + "balance_loss_clip": 1.13249695, + "balance_loss_mlp": 1.02307129, + "epoch": 0.42849842176461744, + "flos": 23332992053760.0, + "grad_norm": 1.6662324256537366, + "language_loss": 0.81413889, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.84115368, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 2.8475935459136963 + }, + { + "auxiliary_loss_clip": 0.01457426, + "auxiliary_loss_mlp": 0.01255228, + "balance_loss_clip": 1.14012039, + "balance_loss_mlp": 1.02749109, + "epoch": 0.4285585450172854, + "flos": 17276335418880.0, + "grad_norm": 1.6270141817546242, + "language_loss": 0.76453388, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.79166043, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.7979860305786133 + }, + { + "auxiliary_loss_clip": 0.0145205, + "auxiliary_loss_mlp": 0.01262235, + "balance_loss_clip": 1.13615084, + "balance_loss_mlp": 1.03259051, + "epoch": 0.42861866826995343, + "flos": 24281888232960.0, + "grad_norm": 1.8602543773131253, + "language_loss": 0.73727983, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76442271, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.798470973968506 + }, + { + "auxiliary_loss_clip": 0.01450223, + "auxiliary_loss_mlp": 0.01252929, + "balance_loss_clip": 1.13373387, + "balance_loss_mlp": 1.02728939, + "epoch": 0.4286787915226214, + "flos": 24355431663840.0, + "grad_norm": 2.5189429946414004, + "language_loss": 0.78160155, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.80863303, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.866909980773926 + }, + { + "auxiliary_loss_clip": 0.01466004, + "auxiliary_loss_mlp": 0.01249231, + "balance_loss_clip": 1.14925432, + "balance_loss_mlp": 1.01729774, + "epoch": 0.42873891477528936, + "flos": 15451783066080.0, + "grad_norm": 2.0562554633663805, + "language_loss": 0.73331106, + "learning_rate": 2.551445257891886e-06, + "loss": 0.76046348, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.730839490890503 + }, + { + "auxiliary_loss_clip": 0.01457107, + "auxiliary_loss_mlp": 0.01262478, + "balance_loss_clip": 1.14086604, + "balance_loss_mlp": 1.03607559, + "epoch": 0.4287990380279573, + "flos": 17641511386560.0, + "grad_norm": 2.226655290802522, + "language_loss": 0.77318889, + "learning_rate": 2.551070882366973e-06, + "loss": 0.80038476, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.814223289489746 + }, + { + "auxiliary_loss_clip": 0.01460266, + "auxiliary_loss_mlp": 0.01255217, + "balance_loss_clip": 1.143664, + "balance_loss_mlp": 1.02824259, + "epoch": 0.4288591612806253, + "flos": 27164798527680.0, + "grad_norm": 1.5603381292588219, + "language_loss": 0.78660738, + "learning_rate": 2.550696485945397e-06, + "loss": 0.81376219, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 2.9028992652893066 + }, + { + "auxiliary_loss_clip": 0.01467964, + "auxiliary_loss_mlp": 0.01272916, + "balance_loss_clip": 1.15214801, + "balance_loss_mlp": 1.04784918, + "epoch": 0.42891928453329325, + "flos": 17164977248160.0, + "grad_norm": 2.714722725187016, + "language_loss": 0.74859035, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77599907, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 4.406763553619385 + }, + { + "auxiliary_loss_clip": 0.01460389, + "auxiliary_loss_mlp": 0.01251481, + "balance_loss_clip": 1.14405322, + "balance_loss_mlp": 1.02641368, + "epoch": 0.4289794077859612, + "flos": 18189009840960.0, + "grad_norm": 1.8611068143387197, + "language_loss": 0.84213972, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.8692584, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.770458698272705 + }, + { + "auxiliary_loss_clip": 0.01463761, + "auxiliary_loss_mlp": 0.01252243, + "balance_loss_clip": 1.1489042, + "balance_loss_mlp": 1.02774858, + "epoch": 0.4290395310386292, + "flos": 28259416154880.0, + "grad_norm": 2.1824791090741402, + "language_loss": 0.7511338, + "learning_rate": 2.549573171442666e-06, + "loss": 0.77829379, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.829993724822998 + }, + { + "auxiliary_loss_clip": 0.0145433, + "auxiliary_loss_mlp": 0.01252161, + "balance_loss_clip": 1.13847351, + "balance_loss_mlp": 1.02156258, + "epoch": 0.42909965429129715, + "flos": 16217901620640.0, + "grad_norm": 2.105149582657309, + "language_loss": 0.79379296, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.82085782, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.783229351043701 + }, + { + "auxiliary_loss_clip": 0.01466815, + "auxiliary_loss_mlp": 0.0125961, + "balance_loss_clip": 1.15088975, + "balance_loss_mlp": 1.02996516, + "epoch": 0.4291597775439651, + "flos": 23115130516800.0, + "grad_norm": 2.288192159682276, + "language_loss": 0.7680741, + "learning_rate": 2.548824190884499e-06, + "loss": 0.79533827, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 2.8144118785858154 + }, + { + "auxiliary_loss_clip": 0.01551107, + "auxiliary_loss_mlp": 0.01222748, + "balance_loss_clip": 1.26374054, + "balance_loss_mlp": 1.01370239, + "epoch": 0.4292199007966331, + "flos": 67552661288160.0, + "grad_norm": 0.7716574046363706, + "language_loss": 0.56123632, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58897489, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 3.26309871673584 + }, + { + "auxiliary_loss_clip": 0.01465286, + "auxiliary_loss_mlp": 0.01249931, + "balance_loss_clip": 1.14972353, + "balance_loss_mlp": 1.02524567, + "epoch": 0.42928002404930105, + "flos": 23001913866240.0, + "grad_norm": 1.7556210436177424, + "language_loss": 0.81059343, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83774555, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.813053607940674 + }, + { + "auxiliary_loss_clip": 0.01467121, + "auxiliary_loss_mlp": 0.01248142, + "balance_loss_clip": 1.15059352, + "balance_loss_mlp": 1.02116728, + "epoch": 0.429340147301969, + "flos": 11546433161280.0, + "grad_norm": 2.9308025559289215, + "language_loss": 0.81928414, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84643674, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 2.8267927169799805 + }, + { + "auxiliary_loss_clip": 0.01467743, + "auxiliary_loss_mlp": 0.01255999, + "balance_loss_clip": 1.15159416, + "balance_loss_mlp": 1.02807093, + "epoch": 0.42940027055463703, + "flos": 25267347522720.0, + "grad_norm": 1.9951727565134136, + "language_loss": 0.86024153, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88747901, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 4.337784767150879 + }, + { + "auxiliary_loss_clip": 0.01477077, + "auxiliary_loss_mlp": 0.01262722, + "balance_loss_clip": 1.16169429, + "balance_loss_mlp": 1.03689241, + "epoch": 0.429460393807305, + "flos": 23807250502560.0, + "grad_norm": 2.263494548237266, + "language_loss": 0.78444982, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.81184781, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 2.871087074279785 + }, + { + "auxiliary_loss_clip": 0.01473415, + "auxiliary_loss_mlp": 0.01252434, + "balance_loss_clip": 1.15863752, + "balance_loss_mlp": 1.02431536, + "epoch": 0.42952051705997296, + "flos": 13920039023040.0, + "grad_norm": 1.955436037834006, + "language_loss": 0.77195871, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.79921722, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 4.477447032928467 + }, + { + "auxiliary_loss_clip": 0.01459916, + "auxiliary_loss_mlp": 0.0124709, + "balance_loss_clip": 1.14476037, + "balance_loss_mlp": 1.02145076, + "epoch": 0.4295806403126409, + "flos": 26762793952320.0, + "grad_norm": 2.0248155039837554, + "language_loss": 0.73524123, + "learning_rate": 2.54620210411532e-06, + "loss": 0.76231128, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 2.8860321044921875 + }, + { + "auxiliary_loss_clip": 0.01475473, + "auxiliary_loss_mlp": 0.01256039, + "balance_loss_clip": 1.1610148, + "balance_loss_mlp": 1.02944565, + "epoch": 0.4296407635653089, + "flos": 20954341746720.0, + "grad_norm": 1.9462815615098223, + "language_loss": 0.79687023, + "learning_rate": 2.545827437329352e-06, + "loss": 0.82418537, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 2.8100850582122803 + }, + { + "auxiliary_loss_clip": 0.01459166, + "auxiliary_loss_mlp": 0.01250387, + "balance_loss_clip": 1.14394784, + "balance_loss_mlp": 1.02837145, + "epoch": 0.42970088681797686, + "flos": 15854204851200.0, + "grad_norm": 1.9237288333295641, + "language_loss": 0.82757407, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85466963, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 2.8094608783721924 + }, + { + "auxiliary_loss_clip": 0.0147011, + "auxiliary_loss_mlp": 0.01268484, + "balance_loss_clip": 1.15555787, + "balance_loss_mlp": 1.04341662, + "epoch": 0.4297610100706448, + "flos": 22384816509600.0, + "grad_norm": 3.2652070070087826, + "language_loss": 0.87520397, + "learning_rate": 2.545078041678131e-06, + "loss": 0.90258992, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 2.8483636379241943 + }, + { + "auxiliary_loss_clip": 0.01467184, + "auxiliary_loss_mlp": 0.01254117, + "balance_loss_clip": 1.1523869, + "balance_loss_mlp": 1.02828717, + "epoch": 0.4298211333233128, + "flos": 27928034542080.0, + "grad_norm": 1.580115582908821, + "language_loss": 0.77934235, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80655539, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 2.9791135787963867 + }, + { + "auxiliary_loss_clip": 0.01465256, + "auxiliary_loss_mlp": 0.01253683, + "balance_loss_clip": 1.15064192, + "balance_loss_mlp": 1.02956927, + "epoch": 0.42988125657598075, + "flos": 24427913106240.0, + "grad_norm": 1.8441470777015536, + "language_loss": 0.80114317, + "learning_rate": 2.544328563349256e-06, + "loss": 0.82833254, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 4.540201187133789 + }, + { + "auxiliary_loss_clip": 0.01469091, + "auxiliary_loss_mlp": 0.01249798, + "balance_loss_clip": 1.15414464, + "balance_loss_mlp": 1.02129793, + "epoch": 0.4299413798286487, + "flos": 15851929161600.0, + "grad_norm": 1.771233530313759, + "language_loss": 0.75101423, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77820313, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 2.888827085494995 + }, + { + "auxiliary_loss_clip": 0.01470487, + "auxiliary_loss_mlp": 0.01267366, + "balance_loss_clip": 1.15534759, + "balance_loss_mlp": 1.04039168, + "epoch": 0.4300015030813167, + "flos": 22311690288480.0, + "grad_norm": 2.0842827450316443, + "language_loss": 0.69984764, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72722614, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 2.883619546890259 + }, + { + "auxiliary_loss_clip": 0.01457309, + "auxiliary_loss_mlp": 0.01246327, + "balance_loss_clip": 1.14291275, + "balance_loss_mlp": 1.01973391, + "epoch": 0.43006162633398465, + "flos": 34900589492640.0, + "grad_norm": 2.1711693028253296, + "language_loss": 0.71130455, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73834085, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 2.9899282455444336 + }, + { + "auxiliary_loss_clip": 0.01460351, + "auxiliary_loss_mlp": 0.01266949, + "balance_loss_clip": 1.14521956, + "balance_loss_mlp": 1.04169083, + "epoch": 0.4301217495866526, + "flos": 15963439044960.0, + "grad_norm": 2.285570484481176, + "language_loss": 0.78079891, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80807185, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.9184112548828125 + }, + { + "auxiliary_loss_clip": 0.014594, + "auxiliary_loss_mlp": 0.01249078, + "balance_loss_clip": 1.14493251, + "balance_loss_mlp": 1.02324796, + "epoch": 0.43018187283932063, + "flos": 18772350770880.0, + "grad_norm": 1.597160140047352, + "language_loss": 0.79014874, + "learning_rate": 2.542454506558389e-06, + "loss": 0.8172335, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 2.8958632946014404 + }, + { + "auxiliary_loss_clip": 0.0145738, + "auxiliary_loss_mlp": 0.0124856, + "balance_loss_clip": 1.14299369, + "balance_loss_mlp": 1.02654457, + "epoch": 0.4302419960919886, + "flos": 20153480633280.0, + "grad_norm": 1.8378065023584749, + "language_loss": 0.887941, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.91500032, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 2.8313310146331787 + }, + { + "auxiliary_loss_clip": 0.01465164, + "auxiliary_loss_mlp": 0.0126127, + "balance_loss_clip": 1.15032125, + "balance_loss_mlp": 1.03238833, + "epoch": 0.43030211934465656, + "flos": 26435167227360.0, + "grad_norm": 2.790119988992194, + "language_loss": 0.82874465, + "learning_rate": 2.541704739753042e-06, + "loss": 0.85600901, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.96817684173584 + }, + { + "auxiliary_loss_clip": 0.01468084, + "auxiliary_loss_mlp": 0.01254633, + "balance_loss_clip": 1.15321076, + "balance_loss_mlp": 1.02746773, + "epoch": 0.43036224259732453, + "flos": 24391539636480.0, + "grad_norm": 1.7559240166471561, + "language_loss": 0.71867907, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74590623, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.8523218631744385 + }, + { + "auxiliary_loss_clip": 0.01459414, + "auxiliary_loss_mlp": 0.01254466, + "balance_loss_clip": 1.14497221, + "balance_loss_mlp": 1.02653813, + "epoch": 0.4304223658499925, + "flos": 17203626407520.0, + "grad_norm": 1.9132180111524792, + "language_loss": 0.8295849, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.85672367, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 2.919523239135742 + }, + { + "auxiliary_loss_clip": 0.01459531, + "auxiliary_loss_mlp": 0.01252919, + "balance_loss_clip": 1.14447212, + "balance_loss_mlp": 1.02422798, + "epoch": 0.43048248910266046, + "flos": 14904777677760.0, + "grad_norm": 93.71223917644761, + "language_loss": 0.82685733, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85398185, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 2.901492118835449 + }, + { + "auxiliary_loss_clip": 0.01463446, + "auxiliary_loss_mlp": 0.01258055, + "balance_loss_clip": 1.14836121, + "balance_loss_mlp": 1.02898264, + "epoch": 0.4305426123553284, + "flos": 21581983131840.0, + "grad_norm": 3.6884882057820065, + "language_loss": 0.77100557, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79822063, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.8734145164489746 + }, + { + "auxiliary_loss_clip": 0.01456503, + "auxiliary_loss_mlp": 0.01248215, + "balance_loss_clip": 1.14114726, + "balance_loss_mlp": 1.0210495, + "epoch": 0.4306027356079964, + "flos": 22603474537920.0, + "grad_norm": 3.9203039970549507, + "language_loss": 0.73030275, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75734991, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.8961548805236816 + }, + { + "auxiliary_loss_clip": 0.01480945, + "auxiliary_loss_mlp": 0.01220436, + "balance_loss_clip": 1.19332063, + "balance_loss_mlp": 1.01215363, + "epoch": 0.43066285886066435, + "flos": 70678048860000.0, + "grad_norm": 0.7894938767120727, + "language_loss": 0.5888871, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61590087, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 3.2692513465881348 + }, + { + "auxiliary_loss_clip": 0.01450833, + "auxiliary_loss_mlp": 0.01252966, + "balance_loss_clip": 1.13473964, + "balance_loss_mlp": 1.02980649, + "epoch": 0.4307229821133323, + "flos": 26722817307360.0, + "grad_norm": 1.7671391850209814, + "language_loss": 0.79084879, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81788683, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.920551061630249 + }, + { + "auxiliary_loss_clip": 0.01451686, + "auxiliary_loss_mlp": 0.01256852, + "balance_loss_clip": 1.13590336, + "balance_loss_mlp": 1.02892423, + "epoch": 0.4307831053660003, + "flos": 26179528878720.0, + "grad_norm": 2.0220002737343243, + "language_loss": 0.67413628, + "learning_rate": 2.538704852009177e-06, + "loss": 0.7012217, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.897915840148926 + }, + { + "auxiliary_loss_clip": 0.01452938, + "auxiliary_loss_mlp": 0.01250463, + "balance_loss_clip": 1.13911128, + "balance_loss_mlp": 1.02348828, + "epoch": 0.43084322861866825, + "flos": 18912003713280.0, + "grad_norm": 2.9790582790753812, + "language_loss": 0.75220418, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77923822, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 2.80828595161438 + }, + { + "auxiliary_loss_clip": 0.01447626, + "auxiliary_loss_mlp": 0.01255857, + "balance_loss_clip": 1.13336015, + "balance_loss_mlp": 1.03212476, + "epoch": 0.4309033518713362, + "flos": 26435356868160.0, + "grad_norm": 1.6852939194293302, + "language_loss": 0.71944821, + "learning_rate": 2.537954675511372e-06, + "loss": 0.74648297, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 2.8108723163604736 + }, + { + "auxiliary_loss_clip": 0.01446445, + "auxiliary_loss_mlp": 0.01248194, + "balance_loss_clip": 1.13279104, + "balance_loss_mlp": 1.0238899, + "epoch": 0.43096347512400424, + "flos": 21215327965920.0, + "grad_norm": 1.5298546998556968, + "language_loss": 0.78558296, + "learning_rate": 2.537579556656414e-06, + "loss": 0.81252933, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.7899200916290283 + }, + { + "auxiliary_loss_clip": 0.01448962, + "auxiliary_loss_mlp": 0.01251149, + "balance_loss_clip": 1.13600063, + "balance_loss_mlp": 1.02436495, + "epoch": 0.4310235983766722, + "flos": 16541735529600.0, + "grad_norm": 2.221688943746687, + "language_loss": 0.82643145, + "learning_rate": 2.537204417416387e-06, + "loss": 0.85343254, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.792637586593628 + }, + { + "auxiliary_loss_clip": 0.01500884, + "auxiliary_loss_mlp": 0.01208946, + "balance_loss_clip": 1.21240151, + "balance_loss_mlp": 1.0014267, + "epoch": 0.43108372162934017, + "flos": 64782133224480.0, + "grad_norm": 0.6758685842890098, + "language_loss": 0.60732293, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.63442123, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.5522372722625732 + }, + { + "auxiliary_loss_clip": 0.01459245, + "auxiliary_loss_mlp": 0.01263354, + "balance_loss_clip": 1.14489484, + "balance_loss_mlp": 1.03714228, + "epoch": 0.43114384488200813, + "flos": 13445818502400.0, + "grad_norm": 1.791553737908246, + "language_loss": 0.76026261, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78748858, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 3.075106143951416 + }, + { + "auxiliary_loss_clip": 0.01452578, + "auxiliary_loss_mlp": 0.01255043, + "balance_loss_clip": 1.13929892, + "balance_loss_mlp": 1.02940369, + "epoch": 0.4312039681346761, + "flos": 26289104425920.0, + "grad_norm": 2.8510797577216582, + "language_loss": 0.77657974, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.80365598, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 4.416921138763428 + }, + { + "auxiliary_loss_clip": 0.01455832, + "auxiliary_loss_mlp": 0.01270549, + "balance_loss_clip": 1.14308739, + "balance_loss_mlp": 1.04643512, + "epoch": 0.43126409138734406, + "flos": 20378814017760.0, + "grad_norm": 1.8332057119044762, + "language_loss": 0.76674914, + "learning_rate": 2.535703656890086e-06, + "loss": 0.79401302, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.769336700439453 + }, + { + "auxiliary_loss_clip": 0.01461657, + "auxiliary_loss_mlp": 0.01267699, + "balance_loss_clip": 1.14972949, + "balance_loss_mlp": 1.04472995, + "epoch": 0.431324214640012, + "flos": 22125043991520.0, + "grad_norm": 1.4742324400155422, + "language_loss": 0.76848209, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79577565, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.7578279972076416 + }, + { + "auxiliary_loss_clip": 0.0146147, + "auxiliary_loss_mlp": 0.0126232, + "balance_loss_clip": 1.14797902, + "balance_loss_mlp": 1.03858829, + "epoch": 0.43138433789268, + "flos": 15232783684320.0, + "grad_norm": 1.8955893756293252, + "language_loss": 0.82632911, + "learning_rate": 2.534953154686407e-06, + "loss": 0.853567, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 2.7639012336730957 + }, + { + "auxiliary_loss_clip": 0.01462261, + "auxiliary_loss_mlp": 0.01266349, + "balance_loss_clip": 1.15108371, + "balance_loss_mlp": 1.03956568, + "epoch": 0.43144446114534796, + "flos": 18152257089600.0, + "grad_norm": 1.9799138339176032, + "language_loss": 0.74276555, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77005172, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.720951795578003 + }, + { + "auxiliary_loss_clip": 0.01457752, + "auxiliary_loss_mlp": 0.01259164, + "balance_loss_clip": 1.14598525, + "balance_loss_mlp": 1.03753042, + "epoch": 0.4315045843980159, + "flos": 22932087395040.0, + "grad_norm": 2.2536928933386315, + "language_loss": 0.7355988, + "learning_rate": 2.534202571340819e-06, + "loss": 0.76276797, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 2.7782278060913086 + }, + { + "auxiliary_loss_clip": 0.01460924, + "auxiliary_loss_mlp": 0.01256621, + "balance_loss_clip": 1.14912522, + "balance_loss_mlp": 1.02850223, + "epoch": 0.4315647076506839, + "flos": 22129026448320.0, + "grad_norm": 3.131936446987228, + "language_loss": 0.81578833, + "learning_rate": 2.533827249275387e-06, + "loss": 0.84296381, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.7650740146636963 + }, + { + "auxiliary_loss_clip": 0.01458235, + "auxiliary_loss_mlp": 0.0125294, + "balance_loss_clip": 1.14787447, + "balance_loss_mlp": 1.03187799, + "epoch": 0.43162483090335185, + "flos": 26873621128800.0, + "grad_norm": 1.4959715356633303, + "language_loss": 0.8432734, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.87038517, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.7846193313598633 + }, + { + "auxiliary_loss_clip": 0.01457908, + "auxiliary_loss_mlp": 0.01262126, + "balance_loss_clip": 1.1470449, + "balance_loss_mlp": 1.03705931, + "epoch": 0.4316849541560198, + "flos": 13914577368000.0, + "grad_norm": 1.8293820973728312, + "language_loss": 0.75086999, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77807033, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 4.25342059135437 + }, + { + "auxiliary_loss_clip": 0.01457451, + "auxiliary_loss_mlp": 0.0126146, + "balance_loss_clip": 1.14556491, + "balance_loss_mlp": 1.03486669, + "epoch": 0.4317450774086878, + "flos": 16437014786880.0, + "grad_norm": 1.927817982478003, + "language_loss": 0.81878114, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84597027, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 2.773930549621582 + }, + { + "auxiliary_loss_clip": 0.01457665, + "auxiliary_loss_mlp": 0.01255268, + "balance_loss_clip": 1.14764929, + "balance_loss_mlp": 1.02714884, + "epoch": 0.4318052006613558, + "flos": 20556888550560.0, + "grad_norm": 1.6971643291344618, + "language_loss": 0.8907817, + "learning_rate": 2.532325758728165e-06, + "loss": 0.91791105, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 4.452470541000366 + }, + { + "auxiliary_loss_clip": 0.01456414, + "auxiliary_loss_mlp": 0.01259116, + "balance_loss_clip": 1.14504993, + "balance_loss_mlp": 1.03443003, + "epoch": 0.43186532391402377, + "flos": 22822246350720.0, + "grad_norm": 1.7172701738867424, + "language_loss": 0.76106513, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.78822041, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.771183729171753 + }, + { + "auxiliary_loss_clip": 0.01463659, + "auxiliary_loss_mlp": 0.01255932, + "balance_loss_clip": 1.15340233, + "balance_loss_mlp": 1.03162813, + "epoch": 0.43192544716669173, + "flos": 25558942131360.0, + "grad_norm": 1.5959201428350342, + "language_loss": 0.77590954, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.80310541, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 2.82246470451355 + }, + { + "auxiliary_loss_clip": 0.01458626, + "auxiliary_loss_mlp": 0.01261211, + "balance_loss_clip": 1.14726639, + "balance_loss_mlp": 1.03938675, + "epoch": 0.4319855704193597, + "flos": 30957235207200.0, + "grad_norm": 1.8379290507139545, + "language_loss": 0.7328186, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.76001698, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 2.8759267330169678 + }, + { + "auxiliary_loss_clip": 0.01458708, + "auxiliary_loss_mlp": 0.01263275, + "balance_loss_clip": 1.14776289, + "balance_loss_mlp": 1.03668213, + "epoch": 0.43204569367202766, + "flos": 24240773743200.0, + "grad_norm": 2.25020689976404, + "language_loss": 0.75913507, + "learning_rate": 2.530823945207421e-06, + "loss": 0.7863549, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 2.779296875 + }, + { + "auxiliary_loss_clip": 0.01465998, + "auxiliary_loss_mlp": 0.0126069, + "balance_loss_clip": 1.15581656, + "balance_loss_mlp": 1.03352475, + "epoch": 0.43210581692469563, + "flos": 18408995354880.0, + "grad_norm": 3.3694591873076454, + "language_loss": 0.75914502, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78641188, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 2.862274169921875 + }, + { + "auxiliary_loss_clip": 0.01548607, + "auxiliary_loss_mlp": 0.01225044, + "balance_loss_clip": 1.2647295, + "balance_loss_mlp": 1.01828766, + "epoch": 0.4321659401773636, + "flos": 49838857735680.0, + "grad_norm": 0.8494006713508957, + "language_loss": 0.68099439, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70873094, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 4.816180944442749 + }, + { + "auxiliary_loss_clip": 0.01458763, + "auxiliary_loss_mlp": 0.01252521, + "balance_loss_clip": 1.1484803, + "balance_loss_mlp": 1.02936101, + "epoch": 0.43222606343003156, + "flos": 17130462258240.0, + "grad_norm": 1.9414980216176234, + "language_loss": 0.78303176, + "learning_rate": 2.529697373663614e-06, + "loss": 0.8101446, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 2.7793750762939453 + }, + { + "auxiliary_loss_clip": 0.01462367, + "auxiliary_loss_mlp": 0.01267359, + "balance_loss_clip": 1.15119147, + "balance_loss_mlp": 1.04114711, + "epoch": 0.4322861866826995, + "flos": 22752306095040.0, + "grad_norm": 2.789819881557188, + "language_loss": 0.70964301, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73694026, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.7912533283233643 + }, + { + "auxiliary_loss_clip": 0.01451862, + "auxiliary_loss_mlp": 0.01252838, + "balance_loss_clip": 1.14116216, + "balance_loss_mlp": 1.02967834, + "epoch": 0.4323463099353675, + "flos": 27894429828000.0, + "grad_norm": 1.4492241668753476, + "language_loss": 0.79670846, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.8237555, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 2.8344156742095947 + }, + { + "auxiliary_loss_clip": 0.01458485, + "auxiliary_loss_mlp": 0.01251677, + "balance_loss_clip": 1.14803803, + "balance_loss_mlp": 1.02832651, + "epoch": 0.43240643318803546, + "flos": 21616801547040.0, + "grad_norm": 1.6503963414428056, + "language_loss": 0.7490412, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77614284, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 2.830461263656616 + }, + { + "auxiliary_loss_clip": 0.0145805, + "auxiliary_loss_mlp": 0.01259566, + "balance_loss_clip": 1.14849281, + "balance_loss_mlp": 1.03449941, + "epoch": 0.4324665564407034, + "flos": 17559851329440.0, + "grad_norm": 1.88193520003964, + "language_loss": 0.78780603, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81498224, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 2.7507617473602295 + }, + { + "auxiliary_loss_clip": 0.01454993, + "auxiliary_loss_mlp": 0.01253081, + "balance_loss_clip": 1.14478111, + "balance_loss_mlp": 1.02801442, + "epoch": 0.4325266796933714, + "flos": 18404216406720.0, + "grad_norm": 2.2431105573776544, + "language_loss": 0.75557256, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.78265327, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 2.7890148162841797 + }, + { + "auxiliary_loss_clip": 0.01462622, + "auxiliary_loss_mlp": 0.01258589, + "balance_loss_clip": 1.15274882, + "balance_loss_mlp": 1.03218651, + "epoch": 0.4325868029460394, + "flos": 22566987283680.0, + "grad_norm": 2.241289437913749, + "language_loss": 0.60128367, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.62849575, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.767033576965332 + }, + { + "auxiliary_loss_clip": 0.01462372, + "auxiliary_loss_mlp": 0.01264184, + "balance_loss_clip": 1.15239453, + "balance_loss_mlp": 1.03701854, + "epoch": 0.43264692619870737, + "flos": 14606962850880.0, + "grad_norm": 3.0127659793212795, + "language_loss": 0.6548624, + "learning_rate": 2.527068004376515e-06, + "loss": 0.68212789, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 2.7595698833465576 + }, + { + "auxiliary_loss_clip": 0.01457448, + "auxiliary_loss_mlp": 0.0125798, + "balance_loss_clip": 1.14745617, + "balance_loss_mlp": 1.02967, + "epoch": 0.43270704945137534, + "flos": 21503509040160.0, + "grad_norm": 1.9962378754326688, + "language_loss": 0.72783977, + "learning_rate": 2.526692300132797e-06, + "loss": 0.75499403, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.8006694316864014 + }, + { + "auxiliary_loss_clip": 0.01452898, + "auxiliary_loss_mlp": 0.01254042, + "balance_loss_clip": 1.14359927, + "balance_loss_mlp": 1.03031039, + "epoch": 0.4327671727040433, + "flos": 25158265041600.0, + "grad_norm": 1.4838912031597424, + "language_loss": 0.72628164, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.75335103, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.8008594512939453 + }, + { + "auxiliary_loss_clip": 0.01451918, + "auxiliary_loss_mlp": 0.01252462, + "balance_loss_clip": 1.14133143, + "balance_loss_mlp": 1.02815771, + "epoch": 0.43282729595671127, + "flos": 25449518296800.0, + "grad_norm": 1.5182827248508974, + "language_loss": 0.81274927, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83979309, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 2.880359411239624 + }, + { + "auxiliary_loss_clip": 0.0145791, + "auxiliary_loss_mlp": 0.0126624, + "balance_loss_clip": 1.14807057, + "balance_loss_mlp": 1.04021931, + "epoch": 0.43288741920937923, + "flos": 24127974302400.0, + "grad_norm": 2.059841284183061, + "language_loss": 0.68951261, + "learning_rate": 2.525565067625286e-06, + "loss": 0.71675408, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.7907652854919434 + }, + { + "auxiliary_loss_clip": 0.01455358, + "auxiliary_loss_mlp": 0.01269662, + "balance_loss_clip": 1.14593053, + "balance_loss_mlp": 1.04326022, + "epoch": 0.4329475424620472, + "flos": 19206784287360.0, + "grad_norm": 1.7791854258216568, + "language_loss": 0.86890686, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89615709, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.786853551864624 + }, + { + "auxiliary_loss_clip": 0.01464594, + "auxiliary_loss_mlp": 0.01266936, + "balance_loss_clip": 1.153373, + "balance_loss_mlp": 1.03748167, + "epoch": 0.43300766571471516, + "flos": 22640606570880.0, + "grad_norm": 1.91037616187889, + "language_loss": 0.6463865, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.67370176, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.8156604766845703 + }, + { + "auxiliary_loss_clip": 0.01457465, + "auxiliary_loss_mlp": 0.01256552, + "balance_loss_clip": 1.14903104, + "balance_loss_mlp": 1.0347271, + "epoch": 0.4330677889673831, + "flos": 22122920014560.0, + "grad_norm": 1.9672614048667967, + "language_loss": 0.82011163, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.84725183, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.8008389472961426 + }, + { + "auxiliary_loss_clip": 0.01464656, + "auxiliary_loss_mlp": 0.01262287, + "balance_loss_clip": 1.15439272, + "balance_loss_mlp": 1.03645706, + "epoch": 0.4331279122200511, + "flos": 23223757860000.0, + "grad_norm": 2.0427640690257105, + "language_loss": 0.8148582, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.84212762, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.768808126449585 + }, + { + "auxiliary_loss_clip": 0.01455487, + "auxiliary_loss_mlp": 0.01259984, + "balance_loss_clip": 1.14481068, + "balance_loss_mlp": 1.03415334, + "epoch": 0.43318803547271906, + "flos": 18261718852320.0, + "grad_norm": 1.9979208354335525, + "language_loss": 0.73908341, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.76623809, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.7217371463775635 + }, + { + "auxiliary_loss_clip": 0.01464271, + "auxiliary_loss_mlp": 0.01261703, + "balance_loss_clip": 1.15483105, + "balance_loss_mlp": 1.03892517, + "epoch": 0.433248158725387, + "flos": 27420929942400.0, + "grad_norm": 1.7426756564654255, + "language_loss": 0.75486016, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.78211993, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.791004180908203 + }, + { + "auxiliary_loss_clip": 0.01457301, + "auxiliary_loss_mlp": 0.01256252, + "balance_loss_clip": 1.14723575, + "balance_loss_mlp": 1.03080332, + "epoch": 0.433308281978055, + "flos": 23220040900320.0, + "grad_norm": 2.2500653988608508, + "language_loss": 0.79082072, + "learning_rate": 2.522934161574342e-06, + "loss": 0.81795633, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.806929349899292 + }, + { + "auxiliary_loss_clip": 0.01464587, + "auxiliary_loss_mlp": 0.01254084, + "balance_loss_clip": 1.15434718, + "balance_loss_mlp": 1.02901649, + "epoch": 0.433368405230723, + "flos": 15854280707520.0, + "grad_norm": 1.8496842693696487, + "language_loss": 0.80960637, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83679312, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.727451801300049 + }, + { + "auxiliary_loss_clip": 0.01469237, + "auxiliary_loss_mlp": 0.01258704, + "balance_loss_clip": 1.15984964, + "balance_loss_mlp": 1.03344584, + "epoch": 0.433428528483391, + "flos": 19028102904000.0, + "grad_norm": 2.1436946269790504, + "language_loss": 0.7005291, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72780854, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.798780679702759 + }, + { + "auxiliary_loss_clip": 0.01464546, + "auxiliary_loss_mlp": 0.01248616, + "balance_loss_clip": 1.15460777, + "balance_loss_mlp": 1.02354932, + "epoch": 0.43348865173605894, + "flos": 24720607631520.0, + "grad_norm": 3.020575492622102, + "language_loss": 0.81398332, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.841115, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 4.420356273651123 + }, + { + "auxiliary_loss_clip": 0.01457618, + "auxiliary_loss_mlp": 0.01252533, + "balance_loss_clip": 1.14783919, + "balance_loss_mlp": 1.02880096, + "epoch": 0.4335487749887269, + "flos": 22092842619360.0, + "grad_norm": 1.9500941770167965, + "language_loss": 0.82220513, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84930664, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.8332910537719727 + }, + { + "auxiliary_loss_clip": 0.0146793, + "auxiliary_loss_mlp": 0.01252391, + "balance_loss_clip": 1.15757728, + "balance_loss_mlp": 1.02656066, + "epoch": 0.43360889824139487, + "flos": 22385157863040.0, + "grad_norm": 2.23778970140798, + "language_loss": 0.75032103, + "learning_rate": 2.521054347790029e-06, + "loss": 0.77752423, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.86772084236145 + }, + { + "auxiliary_loss_clip": 0.01460113, + "auxiliary_loss_mlp": 0.01248597, + "balance_loss_clip": 1.14957929, + "balance_loss_mlp": 1.02562785, + "epoch": 0.43366902149406283, + "flos": 17530115287680.0, + "grad_norm": 1.9920363715565752, + "language_loss": 0.7671181, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.79420519, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.7852396965026855 + }, + { + "auxiliary_loss_clip": 0.01461254, + "auxiliary_loss_mlp": 0.01249747, + "balance_loss_clip": 1.15224767, + "balance_loss_mlp": 1.02506137, + "epoch": 0.4337291447467308, + "flos": 19024499728800.0, + "grad_norm": 2.0808891089588926, + "language_loss": 0.65022522, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67733526, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 2.7708067893981934 + }, + { + "auxiliary_loss_clip": 0.01464514, + "auxiliary_loss_mlp": 0.01257035, + "balance_loss_clip": 1.15463781, + "balance_loss_mlp": 1.03502011, + "epoch": 0.43378926799939876, + "flos": 27236445550560.0, + "grad_norm": 1.673225226401254, + "language_loss": 0.72221637, + "learning_rate": 2.519926222304191e-06, + "loss": 0.74943185, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.799100160598755 + }, + { + "auxiliary_loss_clip": 0.01461631, + "auxiliary_loss_mlp": 0.0125879, + "balance_loss_clip": 1.15249014, + "balance_loss_mlp": 1.03505826, + "epoch": 0.43384939125206673, + "flos": 15962945978880.0, + "grad_norm": 1.8194641397041018, + "language_loss": 0.75121409, + "learning_rate": 2.519550141025255e-06, + "loss": 0.7784183, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 2.74352765083313 + }, + { + "auxiliary_loss_clip": 0.01465734, + "auxiliary_loss_mlp": 0.01264458, + "balance_loss_clip": 1.1555115, + "balance_loss_mlp": 1.03538513, + "epoch": 0.4339095145047347, + "flos": 21795065720640.0, + "grad_norm": 3.995211576634329, + "language_loss": 0.75197875, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77928066, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 4.269510507583618 + }, + { + "auxiliary_loss_clip": 0.01464927, + "auxiliary_loss_mlp": 0.01262536, + "balance_loss_clip": 1.15506899, + "balance_loss_mlp": 1.03804088, + "epoch": 0.43396963775740266, + "flos": 14211595703520.0, + "grad_norm": 1.9815431521472933, + "language_loss": 0.73955798, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76683271, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 2.7631635665893555 + }, + { + "auxiliary_loss_clip": 0.01463098, + "auxiliary_loss_mlp": 0.01265806, + "balance_loss_clip": 1.15296853, + "balance_loss_mlp": 1.03883171, + "epoch": 0.4340297610100706, + "flos": 19721095237440.0, + "grad_norm": 2.312812655412577, + "language_loss": 0.69215882, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.71944785, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.779395818710327 + }, + { + "auxiliary_loss_clip": 0.01462964, + "auxiliary_loss_mlp": 0.01255061, + "balance_loss_clip": 1.15283966, + "balance_loss_mlp": 1.03171051, + "epoch": 0.4340898842627386, + "flos": 18955545605280.0, + "grad_norm": 1.6090246545275797, + "language_loss": 0.7715857, + "learning_rate": 2.518045619038202e-06, + "loss": 0.7987659, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 2.819779634475708 + }, + { + "auxiliary_loss_clip": 0.01461827, + "auxiliary_loss_mlp": 0.01254302, + "balance_loss_clip": 1.15297151, + "balance_loss_mlp": 1.02923512, + "epoch": 0.4341500075154066, + "flos": 22020512889600.0, + "grad_norm": 2.0647352905182257, + "language_loss": 0.69464433, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.72180563, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 4.436882257461548 + }, + { + "auxiliary_loss_clip": 0.01460375, + "auxiliary_loss_mlp": 0.01265565, + "balance_loss_clip": 1.1515367, + "balance_loss_mlp": 1.04259634, + "epoch": 0.4342101307680746, + "flos": 23584496232960.0, + "grad_norm": 1.6010511654663204, + "language_loss": 0.65076458, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67802399, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 3.0007612705230713 + }, + { + "auxiliary_loss_clip": 0.01462826, + "auxiliary_loss_mlp": 0.01246952, + "balance_loss_clip": 1.15369201, + "balance_loss_mlp": 1.01864207, + "epoch": 0.43427025402074254, + "flos": 17969934602880.0, + "grad_norm": 2.9898716374584753, + "language_loss": 0.73198104, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75907874, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.7641961574554443 + }, + { + "auxiliary_loss_clip": 0.01455311, + "auxiliary_loss_mlp": 0.01250864, + "balance_loss_clip": 1.14430714, + "balance_loss_mlp": 1.02427149, + "epoch": 0.4343303772734105, + "flos": 26288801000640.0, + "grad_norm": 2.3542667508011696, + "language_loss": 0.94198585, + "learning_rate": 2.516540782741694e-06, + "loss": 0.96904767, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 2.8086705207824707 + }, + { + "auxiliary_loss_clip": 0.01454243, + "auxiliary_loss_mlp": 0.01247131, + "balance_loss_clip": 1.14520144, + "balance_loss_mlp": 1.02225494, + "epoch": 0.43439050052607847, + "flos": 26836413239520.0, + "grad_norm": 1.8064570721575892, + "language_loss": 0.61287522, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63988894, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 2.831854820251465 + }, + { + "auxiliary_loss_clip": 0.01459636, + "auxiliary_loss_mlp": 0.01251474, + "balance_loss_clip": 1.14958549, + "balance_loss_mlp": 1.02297401, + "epoch": 0.43445062377874644, + "flos": 21399812357760.0, + "grad_norm": 1.8040615588416355, + "language_loss": 0.77458757, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.80169868, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 4.246364116668701 + }, + { + "auxiliary_loss_clip": 0.01458499, + "auxiliary_loss_mlp": 0.01252471, + "balance_loss_clip": 1.1484431, + "balance_loss_mlp": 1.0270226, + "epoch": 0.4345107470314144, + "flos": 19904024574720.0, + "grad_norm": 1.6562894988162915, + "language_loss": 0.84695941, + "learning_rate": 2.515411949802964e-06, + "loss": 0.87406909, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 2.7655715942382812 + }, + { + "auxiliary_loss_clip": 0.01459876, + "auxiliary_loss_mlp": 0.0125829, + "balance_loss_clip": 1.14966941, + "balance_loss_mlp": 1.03226876, + "epoch": 0.43457087028408237, + "flos": 26435053442880.0, + "grad_norm": 2.113971861174452, + "language_loss": 0.76412177, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.79130346, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 2.7975833415985107 + }, + { + "auxiliary_loss_clip": 0.01461709, + "auxiliary_loss_mlp": 0.01263285, + "balance_loss_clip": 1.1505363, + "balance_loss_mlp": 1.03650093, + "epoch": 0.43463099353675033, + "flos": 31871540540160.0, + "grad_norm": 1.683689004828288, + "language_loss": 0.80356902, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.83081901, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 2.8707072734832764 + }, + { + "auxiliary_loss_clip": 0.01459624, + "auxiliary_loss_mlp": 0.01255667, + "balance_loss_clip": 1.14850521, + "balance_loss_mlp": 1.02926481, + "epoch": 0.4346911167894183, + "flos": 24573558697920.0, + "grad_norm": 2.0254096904485728, + "language_loss": 0.81632763, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84348059, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 2.770831346511841 + }, + { + "auxiliary_loss_clip": 0.01461367, + "auxiliary_loss_mlp": 0.01266027, + "balance_loss_clip": 1.15007114, + "balance_loss_mlp": 1.03638232, + "epoch": 0.43475124004208626, + "flos": 17092495805760.0, + "grad_norm": 3.0439677394715488, + "language_loss": 0.76950598, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79677999, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 2.743170738220215 + }, + { + "auxiliary_loss_clip": 0.01449842, + "auxiliary_loss_mlp": 0.01254819, + "balance_loss_clip": 1.13857198, + "balance_loss_mlp": 1.03471065, + "epoch": 0.4348113632947542, + "flos": 26107047436320.0, + "grad_norm": 1.6423936899511986, + "language_loss": 0.69060546, + "learning_rate": 2.513530170872575e-06, + "loss": 0.71765208, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 2.823554039001465 + }, + { + "auxiliary_loss_clip": 0.01459779, + "auxiliary_loss_mlp": 0.01249983, + "balance_loss_clip": 1.14933348, + "balance_loss_mlp": 1.02663302, + "epoch": 0.4348714865474222, + "flos": 34203159564480.0, + "grad_norm": 1.8970518816105033, + "language_loss": 0.71859705, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74569464, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.904015064239502 + }, + { + "auxiliary_loss_clip": 0.01455972, + "auxiliary_loss_mlp": 0.0125027, + "balance_loss_clip": 1.1436367, + "balance_loss_mlp": 1.02253306, + "epoch": 0.43493160980009016, + "flos": 31540007214720.0, + "grad_norm": 1.8491362947075627, + "language_loss": 0.74346113, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.77052361, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 2.8848679065704346 + }, + { + "auxiliary_loss_clip": 0.01459782, + "auxiliary_loss_mlp": 0.01261793, + "balance_loss_clip": 1.1484704, + "balance_loss_mlp": 1.03443682, + "epoch": 0.4349917330527582, + "flos": 24063647414400.0, + "grad_norm": 1.7792384045861178, + "language_loss": 0.58422303, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61143875, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.833284378051758 + }, + { + "auxiliary_loss_clip": 0.0145268, + "auxiliary_loss_mlp": 0.01254612, + "balance_loss_clip": 1.14208961, + "balance_loss_mlp": 1.03183401, + "epoch": 0.43505185630542614, + "flos": 30522005199360.0, + "grad_norm": 1.5817940089075277, + "language_loss": 0.77481192, + "learning_rate": 2.512024397126566e-06, + "loss": 0.80188483, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 2.8374252319335938 + }, + { + "auxiliary_loss_clip": 0.01460456, + "auxiliary_loss_mlp": 0.01251709, + "balance_loss_clip": 1.15135908, + "balance_loss_mlp": 1.02816808, + "epoch": 0.4351119795580941, + "flos": 15736588534080.0, + "grad_norm": 1.7534768040999762, + "language_loss": 0.81044257, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83756423, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.7255444526672363 + }, + { + "auxiliary_loss_clip": 0.01453261, + "auxiliary_loss_mlp": 0.01250505, + "balance_loss_clip": 1.14230287, + "balance_loss_mlp": 1.02868009, + "epoch": 0.4351721028107621, + "flos": 18733625755200.0, + "grad_norm": 1.5956486397742213, + "language_loss": 0.63295579, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65999341, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.778292179107666 + }, + { + "auxiliary_loss_clip": 0.01458336, + "auxiliary_loss_mlp": 0.01250666, + "balance_loss_clip": 1.14822292, + "balance_loss_mlp": 1.02578974, + "epoch": 0.43523222606343004, + "flos": 25229153501280.0, + "grad_norm": 8.779938368034497, + "language_loss": 0.86032951, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88741958, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 2.859297275543213 + }, + { + "auxiliary_loss_clip": 0.01462109, + "auxiliary_loss_mlp": 0.01260044, + "balance_loss_clip": 1.1515131, + "balance_loss_mlp": 1.03669357, + "epoch": 0.435292349316098, + "flos": 22711305389760.0, + "grad_norm": 1.6725209708109785, + "language_loss": 0.72762769, + "learning_rate": 2.510518312724309e-06, + "loss": 0.7548492, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.7826499938964844 + }, + { + "auxiliary_loss_clip": 0.01460038, + "auxiliary_loss_mlp": 0.01257328, + "balance_loss_clip": 1.14952397, + "balance_loss_mlp": 1.03397751, + "epoch": 0.43535247256876597, + "flos": 25778282866560.0, + "grad_norm": 2.0128763305040023, + "language_loss": 0.82115829, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.84833193, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.776414394378662 + }, + { + "auxiliary_loss_clip": 0.01458361, + "auxiliary_loss_mlp": 0.01259658, + "balance_loss_clip": 1.14842343, + "balance_loss_mlp": 1.03420901, + "epoch": 0.43541259582143393, + "flos": 17529963575040.0, + "grad_norm": 2.8235186409980026, + "language_loss": 0.7922802, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81946045, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.7365493774414062 + }, + { + "auxiliary_loss_clip": 0.01450216, + "auxiliary_loss_mlp": 0.01267056, + "balance_loss_clip": 1.13984847, + "balance_loss_mlp": 1.04160762, + "epoch": 0.4354727190741019, + "flos": 15197017065120.0, + "grad_norm": 2.226243695337579, + "language_loss": 0.68183959, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70901227, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.7323503494262695 + }, + { + "auxiliary_loss_clip": 0.01455498, + "auxiliary_loss_mlp": 0.01265982, + "balance_loss_clip": 1.14449525, + "balance_loss_mlp": 1.04663694, + "epoch": 0.43553284232676986, + "flos": 16650855938880.0, + "grad_norm": 1.9708409014153216, + "language_loss": 0.81239253, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83960736, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.814500331878662 + }, + { + "auxiliary_loss_clip": 0.01454092, + "auxiliary_loss_mlp": 0.01247191, + "balance_loss_clip": 1.1425997, + "balance_loss_mlp": 1.02517557, + "epoch": 0.43559296557943783, + "flos": 23402932309440.0, + "grad_norm": 1.692184879810418, + "language_loss": 0.73770052, + "learning_rate": 2.508635271753234e-06, + "loss": 0.76471329, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.760854959487915 + }, + { + "auxiliary_loss_clip": 0.01459782, + "auxiliary_loss_mlp": 0.01250443, + "balance_loss_clip": 1.14982259, + "balance_loss_mlp": 1.02594805, + "epoch": 0.4356530888321058, + "flos": 22421265835680.0, + "grad_norm": 1.5848422912204154, + "language_loss": 0.76737523, + "learning_rate": 2.508258605639389e-06, + "loss": 0.79447746, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.766204357147217 + }, + { + "auxiliary_loss_clip": 0.01460526, + "auxiliary_loss_mlp": 0.01261307, + "balance_loss_clip": 1.15012181, + "balance_loss_mlp": 1.03891039, + "epoch": 0.43571321208477376, + "flos": 21618280745280.0, + "grad_norm": 2.0935366863092195, + "language_loss": 0.85761452, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.88483286, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.8216400146484375 + }, + { + "auxiliary_loss_clip": 0.01457615, + "auxiliary_loss_mlp": 0.01259724, + "balance_loss_clip": 1.14664435, + "balance_loss_mlp": 1.03542018, + "epoch": 0.4357733353374418, + "flos": 23989610917440.0, + "grad_norm": 1.9253433065841647, + "language_loss": 0.7244944, + "learning_rate": 2.507505215606333e-06, + "loss": 0.75166786, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 4.465957403182983 + }, + { + "auxiliary_loss_clip": 0.01460718, + "auxiliary_loss_mlp": 0.01244996, + "balance_loss_clip": 1.15045869, + "balance_loss_mlp": 1.01802182, + "epoch": 0.43583345859010975, + "flos": 25267082025600.0, + "grad_norm": 1.4852545521202185, + "language_loss": 0.87028432, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89734143, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.8019537925720215 + }, + { + "auxiliary_loss_clip": 0.01454031, + "auxiliary_loss_mlp": 0.01246046, + "balance_loss_clip": 1.14287829, + "balance_loss_mlp": 1.02059782, + "epoch": 0.4358935818427777, + "flos": 23698585231200.0, + "grad_norm": 1.8913346278893233, + "language_loss": 0.82111859, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84811938, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.875555992126465 + }, + { + "auxiliary_loss_clip": 0.01468423, + "auxiliary_loss_mlp": 0.01257055, + "balance_loss_clip": 1.15810311, + "balance_loss_mlp": 1.03084338, + "epoch": 0.4359537050954457, + "flos": 29535370136640.0, + "grad_norm": 4.709126682064617, + "language_loss": 0.84872162, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.87597632, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.7712531089782715 + }, + { + "auxiliary_loss_clip": 0.01456328, + "auxiliary_loss_mlp": 0.01247195, + "balance_loss_clip": 1.14465082, + "balance_loss_mlp": 1.02231872, + "epoch": 0.43601382834811364, + "flos": 22713505223040.0, + "grad_norm": 2.1589410978555486, + "language_loss": 0.69337595, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.72041124, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.797369956970215 + }, + { + "auxiliary_loss_clip": 0.01460004, + "auxiliary_loss_mlp": 0.01248431, + "balance_loss_clip": 1.14870071, + "balance_loss_mlp": 1.02374578, + "epoch": 0.4360739516007816, + "flos": 19100736059040.0, + "grad_norm": 2.03023175260284, + "language_loss": 0.83804518, + "learning_rate": 2.505621403992348e-06, + "loss": 0.86512953, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.7622523307800293 + }, + { + "auxiliary_loss_clip": 0.01465266, + "auxiliary_loss_mlp": 0.01252873, + "balance_loss_clip": 1.15592909, + "balance_loss_mlp": 1.0272336, + "epoch": 0.43613407485344957, + "flos": 23406687197280.0, + "grad_norm": 1.5483086838044526, + "language_loss": 0.70231467, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72949606, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.7994494438171387 + }, + { + "auxiliary_loss_clip": 0.01457509, + "auxiliary_loss_mlp": 0.01252062, + "balance_loss_clip": 1.14654207, + "balance_loss_mlp": 1.02871132, + "epoch": 0.43619419810611754, + "flos": 22639999720320.0, + "grad_norm": 2.0362195437206987, + "language_loss": 0.8141861, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.84128177, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.768144369125366 + }, + { + "auxiliary_loss_clip": 0.01463352, + "auxiliary_loss_mlp": 0.01247366, + "balance_loss_clip": 1.15142822, + "balance_loss_mlp": 1.02153563, + "epoch": 0.4362543213587855, + "flos": 20050049448000.0, + "grad_norm": 1.8972628220500478, + "language_loss": 0.77825034, + "learning_rate": 2.504490886831089e-06, + "loss": 0.80535746, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 4.234647512435913 + }, + { + "auxiliary_loss_clip": 0.01458698, + "auxiliary_loss_mlp": 0.01250868, + "balance_loss_clip": 1.14762688, + "balance_loss_mlp": 1.02522814, + "epoch": 0.43631444461145347, + "flos": 21363476816160.0, + "grad_norm": 1.4641423797016082, + "language_loss": 0.76394105, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.79103673, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 2.730363130569458 + }, + { + "auxiliary_loss_clip": 0.01456057, + "auxiliary_loss_mlp": 0.01248342, + "balance_loss_clip": 1.14405942, + "balance_loss_mlp": 1.02041399, + "epoch": 0.43637456786412143, + "flos": 22420696913280.0, + "grad_norm": 1.9313776748779974, + "language_loss": 0.73252124, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75956523, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 2.781543254852295 + }, + { + "auxiliary_loss_clip": 0.01460839, + "auxiliary_loss_mlp": 0.01254176, + "balance_loss_clip": 1.14914179, + "balance_loss_mlp": 1.02567601, + "epoch": 0.4364346911167894, + "flos": 28551390045120.0, + "grad_norm": 1.8486168967772674, + "language_loss": 0.76845378, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.79560393, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 4.445538759231567 + }, + { + "auxiliary_loss_clip": 0.01523936, + "auxiliary_loss_mlp": 0.01216812, + "balance_loss_clip": 1.24090993, + "balance_loss_mlp": 1.01005554, + "epoch": 0.43649481436945736, + "flos": 62665644909600.0, + "grad_norm": 0.7396333044448018, + "language_loss": 0.56896484, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59637225, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.250368595123291 + }, + { + "auxiliary_loss_clip": 0.01460708, + "auxiliary_loss_mlp": 0.0125021, + "balance_loss_clip": 1.14832115, + "balance_loss_mlp": 1.02418911, + "epoch": 0.4365549376221254, + "flos": 30594790067040.0, + "grad_norm": 2.022931006144127, + "language_loss": 0.71442175, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.7415309, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 2.8324224948883057 + }, + { + "auxiliary_loss_clip": 0.01461211, + "auxiliary_loss_mlp": 0.01258804, + "balance_loss_clip": 1.14839935, + "balance_loss_mlp": 1.03087616, + "epoch": 0.43661506087479335, + "flos": 17167821860160.0, + "grad_norm": 1.9833714330890253, + "language_loss": 0.6987071, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.72590721, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.8310110569000244 + }, + { + "auxiliary_loss_clip": 0.0145124, + "auxiliary_loss_mlp": 0.01245075, + "balance_loss_clip": 1.1400274, + "balance_loss_mlp": 1.02210546, + "epoch": 0.4366751841274613, + "flos": 22049300727360.0, + "grad_norm": 1.9908983700154272, + "language_loss": 0.79428053, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82124364, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 2.7544596195220947 + }, + { + "auxiliary_loss_clip": 0.01463946, + "auxiliary_loss_mlp": 0.01259245, + "balance_loss_clip": 1.15196681, + "balance_loss_mlp": 1.03627634, + "epoch": 0.4367353073801293, + "flos": 15999129807840.0, + "grad_norm": 1.6493081085649621, + "language_loss": 0.75028574, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77751768, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 4.249289274215698 + }, + { + "auxiliary_loss_clip": 0.0145352, + "auxiliary_loss_mlp": 0.01244852, + "balance_loss_clip": 1.14123344, + "balance_loss_mlp": 1.01997542, + "epoch": 0.43679543063279724, + "flos": 38220626203200.0, + "grad_norm": 2.056788519269458, + "language_loss": 0.62211853, + "learning_rate": 2.501098303852298e-06, + "loss": 0.64910221, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 2.8933613300323486 + }, + { + "auxiliary_loss_clip": 0.01456445, + "auxiliary_loss_mlp": 0.01250478, + "balance_loss_clip": 1.14395845, + "balance_loss_mlp": 1.02903509, + "epoch": 0.4368555538854652, + "flos": 15194931016320.0, + "grad_norm": 2.104860987037392, + "language_loss": 0.7259751, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.75304437, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 2.739564895629883 + }, + { + "auxiliary_loss_clip": 0.0145566, + "auxiliary_loss_mlp": 0.01258964, + "balance_loss_clip": 1.14395201, + "balance_loss_mlp": 1.03523183, + "epoch": 0.4369156771381332, + "flos": 23070943846080.0, + "grad_norm": 2.280254806499321, + "language_loss": 0.82132691, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.84847313, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 2.79069185256958 + }, + { + "auxiliary_loss_clip": 0.01458108, + "auxiliary_loss_mlp": 0.01253319, + "balance_loss_clip": 1.14620388, + "balance_loss_mlp": 1.03206635, + "epoch": 0.43697580039080114, + "flos": 23443364092320.0, + "grad_norm": 1.7786733330690736, + "language_loss": 0.74487871, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.77199298, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 2.8179163932800293 + }, + { + "auxiliary_loss_clip": 0.01460914, + "auxiliary_loss_mlp": 0.01255881, + "balance_loss_clip": 1.14774609, + "balance_loss_mlp": 1.03024137, + "epoch": 0.4370359236434691, + "flos": 18516636565920.0, + "grad_norm": 2.278812809022553, + "language_loss": 0.80078691, + "learning_rate": 2.499589994531454e-06, + "loss": 0.82795489, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 2.787104606628418 + }, + { + "auxiliary_loss_clip": 0.01459152, + "auxiliary_loss_mlp": 0.01257369, + "balance_loss_clip": 1.14671612, + "balance_loss_mlp": 1.03745127, + "epoch": 0.43709604689613707, + "flos": 23224781920320.0, + "grad_norm": 1.8574187604626324, + "language_loss": 0.7472772, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77444243, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 2.792778491973877 + }, + { + "auxiliary_loss_clip": 0.0145693, + "auxiliary_loss_mlp": 0.01253002, + "balance_loss_clip": 1.14293551, + "balance_loss_mlp": 1.02831614, + "epoch": 0.43715617014880503, + "flos": 23805885088800.0, + "grad_norm": 1.868789901164259, + "language_loss": 0.79609466, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.82319397, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.784424066543579 + }, + { + "auxiliary_loss_clip": 0.01513645, + "auxiliary_loss_mlp": 0.01235458, + "balance_loss_clip": 1.23004675, + "balance_loss_mlp": 1.02946472, + "epoch": 0.437216293401473, + "flos": 61948112692320.0, + "grad_norm": 0.6992552305310845, + "language_loss": 0.54839158, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.57588267, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.374204158782959 + }, + { + "auxiliary_loss_clip": 0.01458411, + "auxiliary_loss_mlp": 0.01261275, + "balance_loss_clip": 1.14570618, + "balance_loss_mlp": 1.03201151, + "epoch": 0.43727641665414096, + "flos": 21984480773280.0, + "grad_norm": 1.7214595143425877, + "language_loss": 0.70185268, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72904956, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 2.7683281898498535 + }, + { + "auxiliary_loss_clip": 0.01455331, + "auxiliary_loss_mlp": 0.01265177, + "balance_loss_clip": 1.14194989, + "balance_loss_mlp": 1.04297125, + "epoch": 0.437336539906809, + "flos": 39534015643200.0, + "grad_norm": 1.8834837095968506, + "language_loss": 0.7545377, + "learning_rate": 2.497704181736367e-06, + "loss": 0.78174281, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.899198055267334 + }, + { + "auxiliary_loss_clip": 0.01447976, + "auxiliary_loss_mlp": 0.01256681, + "balance_loss_clip": 1.13512123, + "balance_loss_mlp": 1.03733635, + "epoch": 0.43739666315947695, + "flos": 17459075115360.0, + "grad_norm": 1.8213197215322559, + "language_loss": 0.80516994, + "learning_rate": 2.49732696250116e-06, + "loss": 0.83221656, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 2.8111300468444824 + }, + { + "auxiliary_loss_clip": 0.01459939, + "auxiliary_loss_mlp": 0.01260833, + "balance_loss_clip": 1.14601159, + "balance_loss_mlp": 1.03710103, + "epoch": 0.4374567864121449, + "flos": 16360399175040.0, + "grad_norm": 2.050347439221823, + "language_loss": 0.80529666, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83250439, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 2.718311071395874 + }, + { + "auxiliary_loss_clip": 0.01456665, + "auxiliary_loss_mlp": 0.01258085, + "balance_loss_clip": 1.14274693, + "balance_loss_mlp": 1.03320885, + "epoch": 0.4375169096648129, + "flos": 30589935262560.0, + "grad_norm": 2.009060163206197, + "language_loss": 0.73223567, + "learning_rate": 2.496572467468988e-06, + "loss": 0.7593832, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.9094231128692627 + }, + { + "auxiliary_loss_clip": 0.01458805, + "auxiliary_loss_mlp": 0.0125429, + "balance_loss_clip": 1.14554667, + "balance_loss_mlp": 1.02960396, + "epoch": 0.43757703291748085, + "flos": 30558264884640.0, + "grad_norm": 2.2973273535167635, + "language_loss": 0.72842622, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.75555712, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 2.80558705329895 + }, + { + "auxiliary_loss_clip": 0.01451215, + "auxiliary_loss_mlp": 0.01261926, + "balance_loss_clip": 1.13810921, + "balance_loss_mlp": 1.03876615, + "epoch": 0.4376371561701488, + "flos": 21399508932480.0, + "grad_norm": 1.5855987398656504, + "language_loss": 0.65985101, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68698239, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 2.7809441089630127 + }, + { + "auxiliary_loss_clip": 0.01462212, + "auxiliary_loss_mlp": 0.01265761, + "balance_loss_clip": 1.14925599, + "balance_loss_mlp": 1.0384053, + "epoch": 0.4376972794228168, + "flos": 23406649269120.0, + "grad_norm": 1.8829553856947734, + "language_loss": 0.82094979, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.84822947, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.7887895107269287 + }, + { + "auxiliary_loss_clip": 0.01449564, + "auxiliary_loss_mlp": 0.0125252, + "balance_loss_clip": 1.13631892, + "balance_loss_mlp": 1.02897882, + "epoch": 0.43775740267548474, + "flos": 22895145002880.0, + "grad_norm": 1.6894466230221288, + "language_loss": 0.77075267, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.79777348, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.7844595909118652 + }, + { + "auxiliary_loss_clip": 0.01447399, + "auxiliary_loss_mlp": 0.01247193, + "balance_loss_clip": 1.13426113, + "balance_loss_mlp": 1.02155375, + "epoch": 0.4378175259281527, + "flos": 23296618584000.0, + "grad_norm": 2.655872149192656, + "language_loss": 0.75854927, + "learning_rate": 2.494685900612569e-06, + "loss": 0.78549522, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.810835361480713 + }, + { + "auxiliary_loss_clip": 0.01459443, + "auxiliary_loss_mlp": 0.01250759, + "balance_loss_clip": 1.14616489, + "balance_loss_mlp": 1.02531016, + "epoch": 0.43787764918082067, + "flos": 23879087166240.0, + "grad_norm": 1.806791821474353, + "language_loss": 0.84987426, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87697625, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.7923848628997803 + }, + { + "auxiliary_loss_clip": 0.01458141, + "auxiliary_loss_mlp": 0.01258469, + "balance_loss_clip": 1.14421737, + "balance_loss_mlp": 1.03035057, + "epoch": 0.43793777243348864, + "flos": 23990445336960.0, + "grad_norm": 1.8971461866953245, + "language_loss": 0.80271852, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82988465, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.8566088676452637 + }, + { + "auxiliary_loss_clip": 0.01455352, + "auxiliary_loss_mlp": 0.01260049, + "balance_loss_clip": 1.1415987, + "balance_loss_mlp": 1.03765225, + "epoch": 0.4379978956861566, + "flos": 18626022472320.0, + "grad_norm": 1.5649070057071826, + "language_loss": 0.80365789, + "learning_rate": 2.493553735281787e-06, + "loss": 0.83081192, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.8157196044921875 + }, + { + "auxiliary_loss_clip": 0.01450408, + "auxiliary_loss_mlp": 0.01261272, + "balance_loss_clip": 1.13770938, + "balance_loss_mlp": 1.03868449, + "epoch": 0.43805801893882457, + "flos": 21983570497440.0, + "grad_norm": 2.372223112792661, + "language_loss": 0.74926388, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77638066, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.876800775527954 + }, + { + "auxiliary_loss_clip": 0.0145264, + "auxiliary_loss_mlp": 0.01263648, + "balance_loss_clip": 1.13838458, + "balance_loss_mlp": 1.04258585, + "epoch": 0.43811814219149253, + "flos": 26395607792160.0, + "grad_norm": 1.5559943582633091, + "language_loss": 0.73720694, + "learning_rate": 2.492798864792712e-06, + "loss": 0.76436973, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 4.470860958099365 + }, + { + "auxiliary_loss_clip": 0.01447191, + "auxiliary_loss_mlp": 0.01257372, + "balance_loss_clip": 1.13205838, + "balance_loss_mlp": 1.03363991, + "epoch": 0.43817826544416055, + "flos": 17495221016160.0, + "grad_norm": 1.749275465525395, + "language_loss": 0.82037205, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84741771, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.8958420753479004 + }, + { + "auxiliary_loss_clip": 0.01442569, + "auxiliary_loss_mlp": 0.01259713, + "balance_loss_clip": 1.12811327, + "balance_loss_mlp": 1.03712511, + "epoch": 0.4382383886968285, + "flos": 21583234761120.0, + "grad_norm": 1.4470643328088066, + "language_loss": 0.84353483, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.87055767, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.896339178085327 + }, + { + "auxiliary_loss_clip": 0.0144754, + "auxiliary_loss_mlp": 0.0125558, + "balance_loss_clip": 1.13266671, + "balance_loss_mlp": 1.03070331, + "epoch": 0.4382985119494965, + "flos": 27925986421440.0, + "grad_norm": 1.5663087292767515, + "language_loss": 0.78330815, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.81033933, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.892815113067627 + }, + { + "auxiliary_loss_clip": 0.01442105, + "auxiliary_loss_mlp": 0.01254464, + "balance_loss_clip": 1.12672639, + "balance_loss_mlp": 1.03149533, + "epoch": 0.43835863520216445, + "flos": 24939189803520.0, + "grad_norm": 10.30587056002522, + "language_loss": 0.78086162, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80782729, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.8445186614990234 + }, + { + "auxiliary_loss_clip": 0.014408, + "auxiliary_loss_mlp": 0.01251259, + "balance_loss_clip": 1.12511086, + "balance_loss_mlp": 1.02962542, + "epoch": 0.4384187584548324, + "flos": 33513087699360.0, + "grad_norm": 1.894273727216764, + "language_loss": 0.64852262, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67544317, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.9129345417022705 + }, + { + "auxiliary_loss_clip": 0.01437793, + "auxiliary_loss_mlp": 0.01255949, + "balance_loss_clip": 1.12194014, + "balance_loss_mlp": 1.03526926, + "epoch": 0.4384788817075004, + "flos": 23953313304000.0, + "grad_norm": 1.8704872306721232, + "language_loss": 0.74268305, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76962048, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.8229634761810303 + }, + { + "auxiliary_loss_clip": 0.01449848, + "auxiliary_loss_mlp": 0.01253111, + "balance_loss_clip": 1.13338733, + "balance_loss_mlp": 1.02880704, + "epoch": 0.43853900496016834, + "flos": 19100508490080.0, + "grad_norm": 2.0127320523844974, + "language_loss": 0.78350657, + "learning_rate": 2.490156230192516e-06, + "loss": 0.81053615, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 4.356847047805786 + }, + { + "auxiliary_loss_clip": 0.01445899, + "auxiliary_loss_mlp": 0.01250257, + "balance_loss_clip": 1.13166797, + "balance_loss_mlp": 1.02652454, + "epoch": 0.4385991282128363, + "flos": 13227084617760.0, + "grad_norm": 1.7141076501877877, + "language_loss": 0.73103565, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75799727, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 2.803813934326172 + }, + { + "auxiliary_loss_clip": 0.01445557, + "auxiliary_loss_mlp": 0.01275351, + "balance_loss_clip": 1.13047516, + "balance_loss_mlp": 1.05047417, + "epoch": 0.4386592514655043, + "flos": 14321664316800.0, + "grad_norm": 1.8669636329440873, + "language_loss": 0.75329626, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.7805053, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 2.8384897708892822 + }, + { + "auxiliary_loss_clip": 0.01448994, + "auxiliary_loss_mlp": 0.01250903, + "balance_loss_clip": 1.13306344, + "balance_loss_mlp": 1.02526355, + "epoch": 0.43871937471817224, + "flos": 22786669372320.0, + "grad_norm": 1.8139721916987008, + "language_loss": 0.6931839, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.7201829, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 4.508381128311157 + }, + { + "auxiliary_loss_clip": 0.01439772, + "auxiliary_loss_mlp": 0.01250603, + "balance_loss_clip": 1.1246376, + "balance_loss_mlp": 1.02668047, + "epoch": 0.4387794979708402, + "flos": 28074704194080.0, + "grad_norm": 1.9787295844822126, + "language_loss": 0.70501161, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.73191535, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.831611156463623 + }, + { + "auxiliary_loss_clip": 0.01444824, + "auxiliary_loss_mlp": 0.01250984, + "balance_loss_clip": 1.12886429, + "balance_loss_mlp": 1.0272522, + "epoch": 0.43883962122350817, + "flos": 26251934464800.0, + "grad_norm": 1.6582093016244468, + "language_loss": 0.72290045, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74985856, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.7983269691467285 + }, + { + "auxiliary_loss_clip": 0.01446109, + "auxiliary_loss_mlp": 0.01256589, + "balance_loss_clip": 1.12975883, + "balance_loss_mlp": 1.03094935, + "epoch": 0.43889974447617613, + "flos": 25886075790240.0, + "grad_norm": 2.2487973694257284, + "language_loss": 0.77068281, + "learning_rate": 2.487890389750719e-06, + "loss": 0.79770976, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 2.8294754028320312 + }, + { + "auxiliary_loss_clip": 0.01442373, + "auxiliary_loss_mlp": 0.0124212, + "balance_loss_clip": 1.12551761, + "balance_loss_mlp": 1.01743388, + "epoch": 0.43895986772884416, + "flos": 25048992919680.0, + "grad_norm": 2.6097987233215894, + "language_loss": 0.70599252, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.73283744, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 2.8251616954803467 + }, + { + "auxiliary_loss_clip": 0.01449711, + "auxiliary_loss_mlp": 0.01254068, + "balance_loss_clip": 1.1332202, + "balance_loss_mlp": 1.02747464, + "epoch": 0.4390199909815121, + "flos": 25996978823040.0, + "grad_norm": 2.3171280723868564, + "language_loss": 0.70562536, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.73266315, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 4.285557746887207 + }, + { + "auxiliary_loss_clip": 0.01442869, + "auxiliary_loss_mlp": 0.01252387, + "balance_loss_clip": 1.12679899, + "balance_loss_mlp": 1.02941775, + "epoch": 0.4390801142341801, + "flos": 29024776146240.0, + "grad_norm": 2.018065933547279, + "language_loss": 0.82398874, + "learning_rate": 2.486757219574983e-06, + "loss": 0.8509413, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 2.865574836730957 + }, + { + "auxiliary_loss_clip": 0.01456425, + "auxiliary_loss_mlp": 0.01267547, + "balance_loss_clip": 1.14016581, + "balance_loss_mlp": 1.03713918, + "epoch": 0.43914023748684805, + "flos": 33441402748320.0, + "grad_norm": 2.116265871881314, + "language_loss": 0.68408144, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.71132118, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 2.8784220218658447 + }, + { + "auxiliary_loss_clip": 0.01446253, + "auxiliary_loss_mlp": 0.0125163, + "balance_loss_clip": 1.13007963, + "balance_loss_mlp": 1.02885175, + "epoch": 0.439200360739516, + "flos": 34534692889920.0, + "grad_norm": 1.6471890285029287, + "language_loss": 0.78391218, + "learning_rate": 2.486001680477873e-06, + "loss": 0.81089103, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 3.052480697631836 + }, + { + "auxiliary_loss_clip": 0.01447354, + "auxiliary_loss_mlp": 0.0124885, + "balance_loss_clip": 1.13002861, + "balance_loss_mlp": 1.02149355, + "epoch": 0.439260483992184, + "flos": 21909989138400.0, + "grad_norm": 2.1508369445153805, + "language_loss": 0.68891466, + "learning_rate": 2.485623883278308e-06, + "loss": 0.7158767, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 2.782866954803467 + }, + { + "auxiliary_loss_clip": 0.01447027, + "auxiliary_loss_mlp": 0.012449, + "balance_loss_clip": 1.12969589, + "balance_loss_mlp": 1.01926088, + "epoch": 0.43932060724485195, + "flos": 20998566345600.0, + "grad_norm": 1.857364710473504, + "language_loss": 0.62910223, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.65602148, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.776482105255127 + }, + { + "auxiliary_loss_clip": 0.01450452, + "auxiliary_loss_mlp": 0.01246158, + "balance_loss_clip": 1.13432717, + "balance_loss_mlp": 1.02013707, + "epoch": 0.4393807304975199, + "flos": 17748887100480.0, + "grad_norm": 2.345094589155335, + "language_loss": 0.71747911, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.7444452, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 2.79386830329895 + }, + { + "auxiliary_loss_clip": 0.01446213, + "auxiliary_loss_mlp": 0.0125086, + "balance_loss_clip": 1.12855482, + "balance_loss_mlp": 1.0259831, + "epoch": 0.4394408537501879, + "flos": 22530727598400.0, + "grad_norm": 2.1069807626213306, + "language_loss": 0.76819241, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79516315, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 2.7745609283447266 + }, + { + "auxiliary_loss_clip": 0.01441499, + "auxiliary_loss_mlp": 0.01251226, + "balance_loss_clip": 1.12435412, + "balance_loss_mlp": 1.03092694, + "epoch": 0.43950097700285584, + "flos": 23442908954400.0, + "grad_norm": 1.740715656316733, + "language_loss": 0.70984578, + "learning_rate": 2.484112510474251e-06, + "loss": 0.73677307, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.846987009048462 + }, + { + "auxiliary_loss_clip": 0.01447092, + "auxiliary_loss_mlp": 0.01250963, + "balance_loss_clip": 1.13187146, + "balance_loss_mlp": 1.02379799, + "epoch": 0.4395611002555238, + "flos": 23182681298400.0, + "grad_norm": 2.1416825129734667, + "language_loss": 0.75972629, + "learning_rate": 2.483734621343429e-06, + "loss": 0.78670681, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.8236372470855713 + }, + { + "auxiliary_loss_clip": 0.01449461, + "auxiliary_loss_mlp": 0.01257759, + "balance_loss_clip": 1.13286233, + "balance_loss_mlp": 1.03383613, + "epoch": 0.43962122350819177, + "flos": 22129557442560.0, + "grad_norm": 2.0156447068805305, + "language_loss": 0.81336248, + "learning_rate": 2.483356713869341e-06, + "loss": 0.84043467, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 2.799992799758911 + }, + { + "auxiliary_loss_clip": 0.01450408, + "auxiliary_loss_mlp": 0.01259599, + "balance_loss_clip": 1.13443136, + "balance_loss_mlp": 1.03643942, + "epoch": 0.43968134676085974, + "flos": 17422474076640.0, + "grad_norm": 3.148440884235076, + "language_loss": 0.84759605, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87469614, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.790114641189575 + }, + { + "auxiliary_loss_clip": 0.01442613, + "auxiliary_loss_mlp": 0.01252281, + "balance_loss_clip": 1.12566137, + "balance_loss_mlp": 1.02797723, + "epoch": 0.43974147001352776, + "flos": 18954635329440.0, + "grad_norm": 1.8862492039225414, + "language_loss": 0.67734742, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.70429635, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 2.7964017391204834 + }, + { + "auxiliary_loss_clip": 0.01450368, + "auxiliary_loss_mlp": 0.01261374, + "balance_loss_clip": 1.13336492, + "balance_loss_mlp": 1.03897667, + "epoch": 0.4398015932661957, + "flos": 18955545605280.0, + "grad_norm": 2.607432722292059, + "language_loss": 0.76859784, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79571521, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 2.7971181869506836 + }, + { + "auxiliary_loss_clip": 0.0144722, + "auxiliary_loss_mlp": 0.01258571, + "balance_loss_clip": 1.13134205, + "balance_loss_mlp": 1.0378902, + "epoch": 0.4398617165188637, + "flos": 24201821158560.0, + "grad_norm": 2.4876580258848358, + "language_loss": 0.7450608, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.77211869, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.8279409408569336 + }, + { + "auxiliary_loss_clip": 0.01458309, + "auxiliary_loss_mlp": 0.01260271, + "balance_loss_clip": 1.14219534, + "balance_loss_mlp": 1.03692091, + "epoch": 0.43992183977153165, + "flos": 22238867492640.0, + "grad_norm": 2.676058837378655, + "language_loss": 0.65043664, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67762244, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 2.8853588104248047 + }, + { + "auxiliary_loss_clip": 0.01457549, + "auxiliary_loss_mlp": 0.01254407, + "balance_loss_clip": 1.14135504, + "balance_loss_mlp": 1.02819514, + "epoch": 0.4399819630241996, + "flos": 18699452118720.0, + "grad_norm": 4.872261468050751, + "language_loss": 0.7994051, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.82652467, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 2.776944160461426 + }, + { + "auxiliary_loss_clip": 0.01451092, + "auxiliary_loss_mlp": 0.01260142, + "balance_loss_clip": 1.13441432, + "balance_loss_mlp": 1.036219, + "epoch": 0.4400420862768676, + "flos": 23881704209280.0, + "grad_norm": 2.04054603614725, + "language_loss": 0.80038738, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.82749975, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 2.854006290435791 + }, + { + "auxiliary_loss_clip": 0.01440662, + "auxiliary_loss_mlp": 0.01254443, + "balance_loss_clip": 1.12554836, + "balance_loss_mlp": 1.03280902, + "epoch": 0.44010220952953555, + "flos": 28040113347840.0, + "grad_norm": 2.0553366201373833, + "language_loss": 0.79513657, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82208759, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.8382956981658936 + }, + { + "auxiliary_loss_clip": 0.01443416, + "auxiliary_loss_mlp": 0.01251296, + "balance_loss_clip": 1.12725651, + "balance_loss_mlp": 1.02889943, + "epoch": 0.4401623327822035, + "flos": 23771256314400.0, + "grad_norm": 1.638688651031711, + "language_loss": 0.69789922, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.72484636, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 2.8214902877807617 + }, + { + "auxiliary_loss_clip": 0.01499339, + "auxiliary_loss_mlp": 0.01218483, + "balance_loss_clip": 1.21097279, + "balance_loss_mlp": 1.0102005, + "epoch": 0.4402224560348715, + "flos": 70782200316000.0, + "grad_norm": 0.8763903155950634, + "language_loss": 0.56875318, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.59593141, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.580883264541626 + }, + { + "auxiliary_loss_clip": 0.01443354, + "auxiliary_loss_mlp": 0.01254644, + "balance_loss_clip": 1.12769043, + "balance_loss_mlp": 1.03377306, + "epoch": 0.44028257928753944, + "flos": 22893931301760.0, + "grad_norm": 1.502845061541942, + "language_loss": 0.76423973, + "learning_rate": 2.479198525097822e-06, + "loss": 0.79121971, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.8460867404937744 + }, + { + "auxiliary_loss_clip": 0.01449985, + "auxiliary_loss_mlp": 0.0125836, + "balance_loss_clip": 1.13405275, + "balance_loss_mlp": 1.03462815, + "epoch": 0.4403427025402074, + "flos": 17897756585760.0, + "grad_norm": 1.6400584753327636, + "language_loss": 0.80694908, + "learning_rate": 2.478820398622511e-06, + "loss": 0.83403254, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.854949712753296 + }, + { + "auxiliary_loss_clip": 0.01496879, + "auxiliary_loss_mlp": 0.01237625, + "balance_loss_clip": 1.2083075, + "balance_loss_mlp": 1.03086853, + "epoch": 0.4404028257928754, + "flos": 69569625018240.0, + "grad_norm": 0.6779466096422226, + "language_loss": 0.54557288, + "learning_rate": 2.478442253990283e-06, + "loss": 0.57291788, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 4.884345769882202 + }, + { + "auxiliary_loss_clip": 0.01447978, + "auxiliary_loss_mlp": 0.01249506, + "balance_loss_clip": 1.13299847, + "balance_loss_mlp": 1.02844429, + "epoch": 0.44046294904554334, + "flos": 20925819406080.0, + "grad_norm": 1.697246246397322, + "language_loss": 0.69525886, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.72223365, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.8014261722564697 + }, + { + "auxiliary_loss_clip": 0.01448879, + "auxiliary_loss_mlp": 0.01254752, + "balance_loss_clip": 1.13405228, + "balance_loss_mlp": 1.03292727, + "epoch": 0.44052307229821136, + "flos": 23625762435360.0, + "grad_norm": 1.7346360260440685, + "language_loss": 0.76308614, + "learning_rate": 2.477685910312432e-06, + "loss": 0.79012245, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.8346195220947266 + }, + { + "auxiliary_loss_clip": 0.01441409, + "auxiliary_loss_mlp": 0.01247729, + "balance_loss_clip": 1.12586474, + "balance_loss_mlp": 1.0251416, + "epoch": 0.4405831955508793, + "flos": 17599296980160.0, + "grad_norm": 2.316785066111441, + "language_loss": 0.84223831, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86912978, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.8385963439941406 + }, + { + "auxiliary_loss_clip": 0.01447826, + "auxiliary_loss_mlp": 0.01260123, + "balance_loss_clip": 1.13175273, + "balance_loss_mlp": 1.03906178, + "epoch": 0.4406433188035473, + "flos": 21465125377920.0, + "grad_norm": 4.0294158500085775, + "language_loss": 0.7756207, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.80270016, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.797290563583374 + }, + { + "auxiliary_loss_clip": 0.01447621, + "auxiliary_loss_mlp": 0.0125408, + "balance_loss_clip": 1.13067412, + "balance_loss_mlp": 1.02863121, + "epoch": 0.44070344205621526, + "flos": 22675728411360.0, + "grad_norm": 1.5945137965260017, + "language_loss": 0.73102224, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75803924, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.7855613231658936 + }, + { + "auxiliary_loss_clip": 0.01453537, + "auxiliary_loss_mlp": 0.01262148, + "balance_loss_clip": 1.13635302, + "balance_loss_mlp": 1.03898811, + "epoch": 0.4407635653088832, + "flos": 23443477876800.0, + "grad_norm": 2.772732390956919, + "language_loss": 0.74158573, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.76874262, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.8776278495788574 + }, + { + "auxiliary_loss_clip": 0.0144433, + "auxiliary_loss_mlp": 0.012458, + "balance_loss_clip": 1.12801838, + "balance_loss_mlp": 1.02225876, + "epoch": 0.4408236885615512, + "flos": 24023405272320.0, + "grad_norm": 1.6794369721883895, + "language_loss": 0.76449448, + "learning_rate": 2.475794734375581e-06, + "loss": 0.79139578, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.7998602390289307 + }, + { + "auxiliary_loss_clip": 0.01446008, + "auxiliary_loss_mlp": 0.01248749, + "balance_loss_clip": 1.1297009, + "balance_loss_mlp": 1.02597094, + "epoch": 0.44088381181421915, + "flos": 12678448318560.0, + "grad_norm": 2.2350388961471714, + "language_loss": 0.73560393, + "learning_rate": 2.475416445004285e-06, + "loss": 0.76255155, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 4.257771015167236 + }, + { + "auxiliary_loss_clip": 0.01453439, + "auxiliary_loss_mlp": 0.01249877, + "balance_loss_clip": 1.13740253, + "balance_loss_mlp": 1.02938795, + "epoch": 0.4409439350668871, + "flos": 24572079499680.0, + "grad_norm": 1.7261322207949437, + "language_loss": 0.79469979, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.821733, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 2.8196473121643066 + }, + { + "auxiliary_loss_clip": 0.01458253, + "auxiliary_loss_mlp": 0.01254918, + "balance_loss_clip": 1.14088368, + "balance_loss_mlp": 1.02279389, + "epoch": 0.4410040583195551, + "flos": 22670456397120.0, + "grad_norm": 2.3923052818420105, + "language_loss": 0.75431514, + "learning_rate": 2.47465981219252e-06, + "loss": 0.78144681, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 4.414506673812866 + }, + { + "auxiliary_loss_clip": 0.014531, + "auxiliary_loss_mlp": 0.01253711, + "balance_loss_clip": 1.13582253, + "balance_loss_mlp": 1.02902484, + "epoch": 0.44106418157222305, + "flos": 10854047678400.0, + "grad_norm": 1.9618839519365265, + "language_loss": 0.7234875, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.75055557, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.7621288299560547 + }, + { + "auxiliary_loss_clip": 0.01448179, + "auxiliary_loss_mlp": 0.0125535, + "balance_loss_clip": 1.12846208, + "balance_loss_mlp": 1.02799368, + "epoch": 0.441124304824891, + "flos": 21728918280960.0, + "grad_norm": 2.3252280457050705, + "language_loss": 0.63281101, + "learning_rate": 2.473903107384165e-06, + "loss": 0.65984631, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.8137929439544678 + }, + { + "auxiliary_loss_clip": 0.01457883, + "auxiliary_loss_mlp": 0.01222397, + "balance_loss_clip": 1.16150558, + "balance_loss_mlp": 1.01182556, + "epoch": 0.441184428077559, + "flos": 63227593992960.0, + "grad_norm": 0.7410299851305859, + "language_loss": 0.52657592, + "learning_rate": 2.473524728017134e-06, + "loss": 0.55337876, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 3.3910892009735107 + }, + { + "auxiliary_loss_clip": 0.01456124, + "auxiliary_loss_mlp": 0.0125539, + "balance_loss_clip": 1.13724279, + "balance_loss_mlp": 1.02402878, + "epoch": 0.44124455133022694, + "flos": 21180054412800.0, + "grad_norm": 2.272643145692179, + "language_loss": 0.70578742, + "learning_rate": 2.473146330693997e-06, + "loss": 0.73290259, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 2.8087029457092285 + }, + { + "auxiliary_loss_clip": 0.01449728, + "auxiliary_loss_mlp": 0.01243082, + "balance_loss_clip": 1.13195014, + "balance_loss_mlp": 1.02240181, + "epoch": 0.4413046745828949, + "flos": 17459833678560.0, + "grad_norm": 1.4324947259860425, + "language_loss": 0.69902921, + "learning_rate": 2.472767915429105e-06, + "loss": 0.72595727, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 4.419029951095581 + }, + { + "auxiliary_loss_clip": 0.0147217, + "auxiliary_loss_mlp": 0.01212425, + "balance_loss_clip": 1.17319691, + "balance_loss_mlp": 1.001091, + "epoch": 0.4413647978355629, + "flos": 61591546781280.0, + "grad_norm": 0.9175048801155042, + "language_loss": 0.64019173, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66703773, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 3.056216239929199 + }, + { + "auxiliary_loss_clip": 0.01452707, + "auxiliary_loss_mlp": 0.01251828, + "balance_loss_clip": 1.13559282, + "balance_loss_mlp": 1.02389991, + "epoch": 0.4414249210882309, + "flos": 27529481429280.0, + "grad_norm": 2.234590551562244, + "language_loss": 0.73595154, + "learning_rate": 2.47201103113145e-06, + "loss": 0.76299691, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 2.868856430053711 + }, + { + "auxiliary_loss_clip": 0.01452054, + "auxiliary_loss_mlp": 0.0125694, + "balance_loss_clip": 1.13223088, + "balance_loss_mlp": 1.03282666, + "epoch": 0.44148504434089886, + "flos": 23516262744480.0, + "grad_norm": 2.611883018126723, + "language_loss": 0.79480118, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.82189107, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 2.841053009033203 + }, + { + "auxiliary_loss_clip": 0.01446476, + "auxiliary_loss_mlp": 0.01249859, + "balance_loss_clip": 1.12758732, + "balance_loss_mlp": 1.02860689, + "epoch": 0.4415451675935668, + "flos": 21582779623200.0, + "grad_norm": 1.659856204142748, + "language_loss": 0.76795506, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.79491842, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 2.820413112640381 + }, + { + "auxiliary_loss_clip": 0.01502433, + "auxiliary_loss_mlp": 0.01218018, + "balance_loss_clip": 1.20468199, + "balance_loss_mlp": 1.00744629, + "epoch": 0.4416052908462348, + "flos": 59012216393760.0, + "grad_norm": 0.7910429426847875, + "language_loss": 0.63764215, + "learning_rate": 2.470875570480556e-06, + "loss": 0.66484666, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 3.0131630897521973 + }, + { + "auxiliary_loss_clip": 0.01454204, + "auxiliary_loss_mlp": 0.01256237, + "balance_loss_clip": 1.1343348, + "balance_loss_mlp": 1.0347935, + "epoch": 0.44166541409890275, + "flos": 26359954957440.0, + "grad_norm": 1.763707354836162, + "language_loss": 0.85563356, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88273799, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.8709652423858643 + }, + { + "auxiliary_loss_clip": 0.01455964, + "auxiliary_loss_mlp": 0.01265274, + "balance_loss_clip": 1.13680565, + "balance_loss_mlp": 1.04154193, + "epoch": 0.4417255373515707, + "flos": 20194708907520.0, + "grad_norm": 1.9267506354704274, + "language_loss": 0.80705714, + "learning_rate": 2.470118507411128e-06, + "loss": 0.83426952, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 2.8122658729553223 + }, + { + "auxiliary_loss_clip": 0.01454123, + "auxiliary_loss_mlp": 0.01260053, + "balance_loss_clip": 1.1355933, + "balance_loss_mlp": 1.03765643, + "epoch": 0.4417856606042387, + "flos": 17889450318720.0, + "grad_norm": 2.0951882610226256, + "language_loss": 0.83000171, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.8571434, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.8697423934936523 + }, + { + "auxiliary_loss_clip": 0.01457847, + "auxiliary_loss_mlp": 0.01267401, + "balance_loss_clip": 1.13814747, + "balance_loss_mlp": 1.04519463, + "epoch": 0.44184578385690665, + "flos": 27966494060640.0, + "grad_norm": 3.1557120469012614, + "language_loss": 0.70439672, + "learning_rate": 2.469361373033938e-06, + "loss": 0.73164928, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 2.881692886352539 + }, + { + "auxiliary_loss_clip": 0.01444419, + "auxiliary_loss_mlp": 0.0126276, + "balance_loss_clip": 1.12485254, + "balance_loss_mlp": 1.04131699, + "epoch": 0.4419059071095746, + "flos": 23370162014880.0, + "grad_norm": 3.3453507438032375, + "language_loss": 0.74419272, + "learning_rate": 2.468982779140819e-06, + "loss": 0.77126455, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.7774009704589844 + }, + { + "auxiliary_loss_clip": 0.0145526, + "auxiliary_loss_mlp": 0.01269036, + "balance_loss_clip": 1.13648736, + "balance_loss_mlp": 1.04606676, + "epoch": 0.4419660303622426, + "flos": 15013936015200.0, + "grad_norm": 2.422769918638662, + "language_loss": 0.80919433, + "learning_rate": 2.468604167463827e-06, + "loss": 0.83643723, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.791116714477539 + }, + { + "auxiliary_loss_clip": 0.01450703, + "auxiliary_loss_mlp": 0.01253464, + "balance_loss_clip": 1.13223648, + "balance_loss_mlp": 1.03621674, + "epoch": 0.44202615361491054, + "flos": 25373964673440.0, + "grad_norm": 1.759078005533697, + "language_loss": 0.73028719, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75732887, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 2.8157689571380615 + }, + { + "auxiliary_loss_clip": 0.01453384, + "auxiliary_loss_mlp": 0.0125927, + "balance_loss_clip": 1.13533568, + "balance_loss_mlp": 1.03591967, + "epoch": 0.4420862768675785, + "flos": 24683437670400.0, + "grad_norm": 1.8509449596535592, + "language_loss": 0.87304926, + "learning_rate": 2.467846890815649e-06, + "loss": 0.90017581, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.8093101978302 + }, + { + "auxiliary_loss_clip": 0.01455194, + "auxiliary_loss_mlp": 0.01252554, + "balance_loss_clip": 1.13628304, + "balance_loss_mlp": 1.02958453, + "epoch": 0.44214640012024653, + "flos": 19530011345760.0, + "grad_norm": 3.314338784100458, + "language_loss": 0.75911897, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78619641, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 2.8167946338653564 + }, + { + "auxiliary_loss_clip": 0.01449935, + "auxiliary_loss_mlp": 0.01256161, + "balance_loss_clip": 1.13223827, + "balance_loss_mlp": 1.03490865, + "epoch": 0.4422065233729145, + "flos": 47561021935200.0, + "grad_norm": 2.500108711605213, + "language_loss": 0.6521706, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67923158, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 3.025534152984619 + }, + { + "auxiliary_loss_clip": 0.01454817, + "auxiliary_loss_mlp": 0.01255703, + "balance_loss_clip": 1.13604927, + "balance_loss_mlp": 1.03292465, + "epoch": 0.44226664662558246, + "flos": 19283058545760.0, + "grad_norm": 1.9584706481573662, + "language_loss": 0.78439522, + "learning_rate": 2.466710842823274e-06, + "loss": 0.81150049, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 2.8093788623809814 + }, + { + "auxiliary_loss_clip": 0.01455395, + "auxiliary_loss_mlp": 0.01246902, + "balance_loss_clip": 1.13812661, + "balance_loss_mlp": 1.02107167, + "epoch": 0.4423267698782504, + "flos": 17823758016960.0, + "grad_norm": 1.8575569880460479, + "language_loss": 0.77522534, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.80224836, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 2.781991720199585 + }, + { + "auxiliary_loss_clip": 0.01456473, + "auxiliary_loss_mlp": 0.01244528, + "balance_loss_clip": 1.14071536, + "balance_loss_mlp": 1.01946068, + "epoch": 0.4423868931309184, + "flos": 29206946920320.0, + "grad_norm": 1.7844327695491404, + "language_loss": 0.73249769, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75950766, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.8859007358551025 + }, + { + "auxiliary_loss_clip": 0.01453567, + "auxiliary_loss_mlp": 0.01252292, + "balance_loss_clip": 1.13751197, + "balance_loss_mlp": 1.02817845, + "epoch": 0.44244701638358636, + "flos": 29715568646400.0, + "grad_norm": 2.296198435835237, + "language_loss": 0.75911617, + "learning_rate": 2.465574635551405e-06, + "loss": 0.78617477, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.881317377090454 + }, + { + "auxiliary_loss_clip": 0.01451008, + "auxiliary_loss_mlp": 0.01255052, + "balance_loss_clip": 1.13455725, + "balance_loss_mlp": 1.03151059, + "epoch": 0.4425071396362543, + "flos": 22932277035840.0, + "grad_norm": 2.1500503795363977, + "language_loss": 0.70014226, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72720277, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 2.7956037521362305 + }, + { + "auxiliary_loss_clip": 0.01450999, + "auxiliary_loss_mlp": 0.0125824, + "balance_loss_clip": 1.13529396, + "balance_loss_mlp": 1.03450763, + "epoch": 0.4425672628889223, + "flos": 19794411099360.0, + "grad_norm": 2.4814994669605652, + "language_loss": 0.69971848, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.72681081, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.852097988128662 + }, + { + "auxiliary_loss_clip": 0.01449171, + "auxiliary_loss_mlp": 0.012457, + "balance_loss_clip": 1.13318682, + "balance_loss_mlp": 1.02196813, + "epoch": 0.44262738614159025, + "flos": 13663907608320.0, + "grad_norm": 2.136891144599901, + "language_loss": 0.82253927, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84948802, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.764998197555542 + }, + { + "auxiliary_loss_clip": 0.01454661, + "auxiliary_loss_mlp": 0.0125507, + "balance_loss_clip": 1.13849878, + "balance_loss_mlp": 1.02809525, + "epoch": 0.4426875093942582, + "flos": 14211974985120.0, + "grad_norm": 2.1043615339007467, + "language_loss": 0.74441963, + "learning_rate": 2.464059445424366e-06, + "loss": 0.77151692, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 4.386328935623169 + }, + { + "auxiliary_loss_clip": 0.01517147, + "auxiliary_loss_mlp": 0.01218208, + "balance_loss_clip": 1.22879493, + "balance_loss_mlp": 1.01068878, + "epoch": 0.4427476326469262, + "flos": 70125088386240.0, + "grad_norm": 0.6859086780050654, + "language_loss": 0.55619377, + "learning_rate": 2.463680603863743e-06, + "loss": 0.58354729, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 3.40554141998291 + }, + { + "auxiliary_loss_clip": 0.01447653, + "auxiliary_loss_mlp": 0.01240541, + "balance_loss_clip": 1.13259387, + "balance_loss_mlp": 1.01909757, + "epoch": 0.44280775589959415, + "flos": 25447318463520.0, + "grad_norm": 5.43620370066613, + "language_loss": 0.74637312, + "learning_rate": 2.463301744720305e-06, + "loss": 0.77325505, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.8258683681488037 + }, + { + "auxiliary_loss_clip": 0.01445378, + "auxiliary_loss_mlp": 0.01254751, + "balance_loss_clip": 1.12993908, + "balance_loss_mlp": 1.03311694, + "epoch": 0.4428678791522621, + "flos": 22859454240000.0, + "grad_norm": 1.6828512931980075, + "language_loss": 0.74254584, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.7695471, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.8338520526885986 + }, + { + "auxiliary_loss_clip": 0.01458812, + "auxiliary_loss_mlp": 0.01269053, + "balance_loss_clip": 1.14269066, + "balance_loss_mlp": 1.04589343, + "epoch": 0.44292800240493013, + "flos": 25814845977120.0, + "grad_norm": 1.9840326675245012, + "language_loss": 0.73496157, + "learning_rate": 2.46254397374245e-06, + "loss": 0.76224023, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.792158603668213 + }, + { + "auxiliary_loss_clip": 0.01452756, + "auxiliary_loss_mlp": 0.01257001, + "balance_loss_clip": 1.13696802, + "balance_loss_mlp": 1.03345954, + "epoch": 0.4429881256575981, + "flos": 32419683773280.0, + "grad_norm": 1.558639335643872, + "language_loss": 0.73792541, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76502299, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.886554002761841 + }, + { + "auxiliary_loss_clip": 0.01450784, + "auxiliary_loss_mlp": 0.0124935, + "balance_loss_clip": 1.13551128, + "balance_loss_mlp": 1.02542758, + "epoch": 0.44304824891026606, + "flos": 22165892984160.0, + "grad_norm": 1.6835668139374627, + "language_loss": 0.79885757, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82585895, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 2.849546432495117 + }, + { + "auxiliary_loss_clip": 0.01446366, + "auxiliary_loss_mlp": 0.01254399, + "balance_loss_clip": 1.13043833, + "balance_loss_mlp": 1.03314626, + "epoch": 0.443108372162934, + "flos": 25340966809920.0, + "grad_norm": 4.115631710179004, + "language_loss": 0.71768904, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74469674, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 2.876490354537964 + }, + { + "auxiliary_loss_clip": 0.01442527, + "auxiliary_loss_mlp": 0.01262789, + "balance_loss_clip": 1.1266017, + "balance_loss_mlp": 1.04077387, + "epoch": 0.443168495415602, + "flos": 23333181694560.0, + "grad_norm": 2.041702548963336, + "language_loss": 0.70539308, + "learning_rate": 2.461028221425126e-06, + "loss": 0.73244631, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 4.34818172454834 + }, + { + "auxiliary_loss_clip": 0.01447869, + "auxiliary_loss_mlp": 0.01249764, + "balance_loss_clip": 1.13247573, + "balance_loss_mlp": 1.02813041, + "epoch": 0.44322861866826996, + "flos": 21873653596800.0, + "grad_norm": 2.494959973037916, + "language_loss": 0.67877984, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70575619, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 5.273115396499634 + }, + { + "auxiliary_loss_clip": 0.01448176, + "auxiliary_loss_mlp": 0.0124875, + "balance_loss_clip": 1.13152587, + "balance_loss_mlp": 1.02559054, + "epoch": 0.4432887419209379, + "flos": 20086498774080.0, + "grad_norm": 1.8756524308288194, + "language_loss": 0.83349121, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.86046046, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 2.864640951156616 + }, + { + "auxiliary_loss_clip": 0.01490433, + "auxiliary_loss_mlp": 0.01215927, + "balance_loss_clip": 1.20263565, + "balance_loss_mlp": 1.00840759, + "epoch": 0.4433488651736059, + "flos": 70042593909600.0, + "grad_norm": 0.8018994263499936, + "language_loss": 0.55193537, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57899898, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 3.45626163482666 + }, + { + "auxiliary_loss_clip": 0.01455264, + "auxiliary_loss_mlp": 0.0126737, + "balance_loss_clip": 1.13962007, + "balance_loss_mlp": 1.04592729, + "epoch": 0.44340898842627385, + "flos": 16283935275840.0, + "grad_norm": 2.4518854415162243, + "language_loss": 0.83274066, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.85996699, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 2.7654101848602295 + }, + { + "auxiliary_loss_clip": 0.01445425, + "auxiliary_loss_mlp": 0.01253146, + "balance_loss_clip": 1.13005185, + "balance_loss_mlp": 1.03246617, + "epoch": 0.4434691116789418, + "flos": 16613534265120.0, + "grad_norm": 1.9655801185832922, + "language_loss": 0.84018576, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86717147, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.7308197021484375 + }, + { + "auxiliary_loss_clip": 0.01444875, + "auxiliary_loss_mlp": 0.0125489, + "balance_loss_clip": 1.12982869, + "balance_loss_mlp": 1.03077626, + "epoch": 0.4435292349316098, + "flos": 19065310793280.0, + "grad_norm": 1.995329586249868, + "language_loss": 0.77360409, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.80060172, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 4.405495643615723 + }, + { + "auxiliary_loss_clip": 0.01445338, + "auxiliary_loss_mlp": 0.01249708, + "balance_loss_clip": 1.13017952, + "balance_loss_mlp": 1.02902794, + "epoch": 0.44358935818427775, + "flos": 21253446131040.0, + "grad_norm": 1.997286019538722, + "language_loss": 0.75998753, + "learning_rate": 2.458374982357057e-06, + "loss": 0.78693795, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 2.7900941371917725 + }, + { + "auxiliary_loss_clip": 0.01450237, + "auxiliary_loss_mlp": 0.01260837, + "balance_loss_clip": 1.13268411, + "balance_loss_mlp": 1.04091978, + "epoch": 0.4436494814369457, + "flos": 12496960251360.0, + "grad_norm": 2.0873795581357437, + "language_loss": 0.69175911, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71886981, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 2.796041250228882 + }, + { + "auxiliary_loss_clip": 0.01450059, + "auxiliary_loss_mlp": 0.01257401, + "balance_loss_clip": 1.13487434, + "balance_loss_mlp": 1.03462267, + "epoch": 0.44370960468961373, + "flos": 23662135905120.0, + "grad_norm": 1.744125914546111, + "language_loss": 0.73060679, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75768143, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 2.792184352874756 + }, + { + "auxiliary_loss_clip": 0.01446169, + "auxiliary_loss_mlp": 0.01250642, + "balance_loss_clip": 1.13072121, + "balance_loss_mlp": 1.02958071, + "epoch": 0.4437697279422817, + "flos": 32419645845120.0, + "grad_norm": 3.0377355678591433, + "language_loss": 0.64927983, + "learning_rate": 2.457237618887458e-06, + "loss": 0.67624795, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.823347330093384 + }, + { + "auxiliary_loss_clip": 0.01450768, + "auxiliary_loss_mlp": 0.01255745, + "balance_loss_clip": 1.13542771, + "balance_loss_mlp": 1.03220367, + "epoch": 0.44382985119494966, + "flos": 18114783703200.0, + "grad_norm": 2.1358156388927245, + "language_loss": 0.80144709, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82851219, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 2.7512762546539307 + }, + { + "auxiliary_loss_clip": 0.01455521, + "auxiliary_loss_mlp": 0.01256903, + "balance_loss_clip": 1.13969326, + "balance_loss_mlp": 1.0325985, + "epoch": 0.44388997444761763, + "flos": 30776809128480.0, + "grad_norm": 1.8372058158840041, + "language_loss": 0.65338022, + "learning_rate": 2.456479289857949e-06, + "loss": 0.68050444, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 2.8520095348358154 + }, + { + "auxiliary_loss_clip": 0.01453009, + "auxiliary_loss_mlp": 0.01248748, + "balance_loss_clip": 1.13777924, + "balance_loss_mlp": 1.0232991, + "epoch": 0.4439500977002856, + "flos": 20341226846880.0, + "grad_norm": 3.016098655637739, + "language_loss": 0.75714105, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.78415859, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 2.735145092010498 + }, + { + "auxiliary_loss_clip": 0.01454263, + "auxiliary_loss_mlp": 0.0125358, + "balance_loss_clip": 1.13970494, + "balance_loss_mlp": 1.03022957, + "epoch": 0.44401022095295356, + "flos": 20373048937440.0, + "grad_norm": 1.6062034196531887, + "language_loss": 0.80935216, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83643067, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.7681448459625244 + }, + { + "auxiliary_loss_clip": 0.01449384, + "auxiliary_loss_mlp": 0.01249869, + "balance_loss_clip": 1.13521791, + "balance_loss_mlp": 1.02594662, + "epoch": 0.4440703442056215, + "flos": 20232751216320.0, + "grad_norm": 1.8954543658924339, + "language_loss": 0.82061702, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84760958, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.7887747287750244 + }, + { + "auxiliary_loss_clip": 0.01454886, + "auxiliary_loss_mlp": 0.01250386, + "balance_loss_clip": 1.14047027, + "balance_loss_mlp": 1.02112257, + "epoch": 0.4441304674582895, + "flos": 39497869742400.0, + "grad_norm": 1.997311568675846, + "language_loss": 0.69765878, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.72471154, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 2.9397404193878174 + }, + { + "auxiliary_loss_clip": 0.01453315, + "auxiliary_loss_mlp": 0.01253996, + "balance_loss_clip": 1.13981271, + "balance_loss_mlp": 1.03198051, + "epoch": 0.44419059071095746, + "flos": 14831423887680.0, + "grad_norm": 2.5121279216685988, + "language_loss": 0.72057462, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.7476477, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 2.764185905456543 + }, + { + "auxiliary_loss_clip": 0.01455526, + "auxiliary_loss_mlp": 0.01246942, + "balance_loss_clip": 1.14088428, + "balance_loss_mlp": 1.02130246, + "epoch": 0.4442507139636254, + "flos": 22640037648480.0, + "grad_norm": 2.119318385708216, + "language_loss": 0.69458377, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.72160846, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.7962806224823 + }, + { + "auxiliary_loss_clip": 0.01447491, + "auxiliary_loss_mlp": 0.01247986, + "balance_loss_clip": 1.13436007, + "balance_loss_mlp": 1.02444458, + "epoch": 0.4443108372162934, + "flos": 38293828280640.0, + "grad_norm": 2.024410780259809, + "language_loss": 0.75533056, + "learning_rate": 2.453824593752788e-06, + "loss": 0.78228539, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.9088730812072754 + }, + { + "auxiliary_loss_clip": 0.01453883, + "auxiliary_loss_mlp": 0.01244277, + "balance_loss_clip": 1.14081001, + "balance_loss_mlp": 1.02111745, + "epoch": 0.44437096046896135, + "flos": 17750935221120.0, + "grad_norm": 2.0242608242097093, + "language_loss": 0.81421983, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.84120142, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.751425266265869 + }, + { + "auxiliary_loss_clip": 0.01456669, + "auxiliary_loss_mlp": 0.01239614, + "balance_loss_clip": 1.14502811, + "balance_loss_mlp": 1.01550102, + "epoch": 0.4444310837216293, + "flos": 13733885792160.0, + "grad_norm": 1.728836045048036, + "language_loss": 0.73718762, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.76415044, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 2.7183117866516113 + }, + { + "auxiliary_loss_clip": 0.01451059, + "auxiliary_loss_mlp": 0.01246854, + "balance_loss_clip": 1.13907456, + "balance_loss_mlp": 1.02522016, + "epoch": 0.44449120697429734, + "flos": 25012619449920.0, + "grad_norm": 1.8187813557217385, + "language_loss": 0.7937665, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.82074559, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.7947773933410645 + }, + { + "auxiliary_loss_clip": 0.01456617, + "auxiliary_loss_mlp": 0.01251491, + "balance_loss_clip": 1.14486384, + "balance_loss_mlp": 1.02852166, + "epoch": 0.4445513302269653, + "flos": 32674487702400.0, + "grad_norm": 2.119361655418334, + "language_loss": 0.81190509, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83898616, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.835386276245117 + }, + { + "auxiliary_loss_clip": 0.01449014, + "auxiliary_loss_mlp": 0.01244071, + "balance_loss_clip": 1.13636076, + "balance_loss_mlp": 1.0228188, + "epoch": 0.44461145347963327, + "flos": 11657412050400.0, + "grad_norm": 2.1702182667918963, + "language_loss": 0.79990613, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.826837, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 2.760650396347046 + }, + { + "auxiliary_loss_clip": 0.01449835, + "auxiliary_loss_mlp": 0.01242879, + "balance_loss_clip": 1.13857806, + "balance_loss_mlp": 1.01914752, + "epoch": 0.44467157673230123, + "flos": 20888877013920.0, + "grad_norm": 2.0082814534961164, + "language_loss": 0.68454254, + "learning_rate": 2.451548468607584e-06, + "loss": 0.71146971, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 2.786051034927368 + }, + { + "auxiliary_loss_clip": 0.0146277, + "auxiliary_loss_mlp": 0.01249034, + "balance_loss_clip": 1.15033817, + "balance_loss_mlp": 1.02453947, + "epoch": 0.4447316999849692, + "flos": 18547813877760.0, + "grad_norm": 1.8431672461486035, + "language_loss": 0.80975902, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83687711, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.7780892848968506 + }, + { + "auxiliary_loss_clip": 0.01455414, + "auxiliary_loss_mlp": 0.01248921, + "balance_loss_clip": 1.14348233, + "balance_loss_mlp": 1.02785957, + "epoch": 0.44479182323763716, + "flos": 23771332170720.0, + "grad_norm": 1.911150062801868, + "language_loss": 0.67427087, + "learning_rate": 2.450789623090293e-06, + "loss": 0.70131421, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.8220880031585693 + }, + { + "auxiliary_loss_clip": 0.01459813, + "auxiliary_loss_mlp": 0.0124611, + "balance_loss_clip": 1.14874578, + "balance_loss_mlp": 1.02638364, + "epoch": 0.44485194649030513, + "flos": 16545642130080.0, + "grad_norm": 2.6797193772866037, + "language_loss": 0.69614178, + "learning_rate": 2.450410174683472e-06, + "loss": 0.72320104, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.748483896255493 + }, + { + "auxiliary_loss_clip": 0.01458335, + "auxiliary_loss_mlp": 0.01251952, + "balance_loss_clip": 1.14582705, + "balance_loss_mlp": 1.03127182, + "epoch": 0.4449120697429731, + "flos": 22603133184480.0, + "grad_norm": 1.8248028437942492, + "language_loss": 0.72176266, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.74886549, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 2.8140218257904053 + }, + { + "auxiliary_loss_clip": 0.01455288, + "auxiliary_loss_mlp": 0.0124923, + "balance_loss_clip": 1.14364791, + "balance_loss_mlp": 1.02950358, + "epoch": 0.44497219299564106, + "flos": 20006583412320.0, + "grad_norm": 1.6158552293815402, + "language_loss": 0.8490541, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87609935, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.8406007289886475 + }, + { + "auxiliary_loss_clip": 0.01454981, + "auxiliary_loss_mlp": 0.01249834, + "balance_loss_clip": 1.14347386, + "balance_loss_mlp": 1.02991652, + "epoch": 0.445032316248309, + "flos": 25596681014880.0, + "grad_norm": 1.7349696590791974, + "language_loss": 0.83005214, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85710025, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 4.399066686630249 + }, + { + "auxiliary_loss_clip": 0.01459925, + "auxiliary_loss_mlp": 0.01255413, + "balance_loss_clip": 1.1484797, + "balance_loss_mlp": 1.0318718, + "epoch": 0.445092439500977, + "flos": 21252497927040.0, + "grad_norm": 1.782800058291536, + "language_loss": 0.77332568, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.80047905, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 2.773273468017578 + }, + { + "auxiliary_loss_clip": 0.01487033, + "auxiliary_loss_mlp": 0.01226349, + "balance_loss_clip": 1.20366442, + "balance_loss_mlp": 1.01882935, + "epoch": 0.44515256275364495, + "flos": 57770625689280.0, + "grad_norm": 0.7472508917402462, + "language_loss": 0.60037041, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62750423, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.293677568435669 + }, + { + "auxiliary_loss_clip": 0.01461474, + "auxiliary_loss_mlp": 0.01258598, + "balance_loss_clip": 1.14908624, + "balance_loss_mlp": 1.03620148, + "epoch": 0.4452126860063129, + "flos": 15597807939360.0, + "grad_norm": 2.0782331618786833, + "language_loss": 0.81763256, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84483337, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.7562084197998047 + }, + { + "auxiliary_loss_clip": 0.01452854, + "auxiliary_loss_mlp": 0.01265994, + "balance_loss_clip": 1.14196301, + "balance_loss_mlp": 1.04912806, + "epoch": 0.4452728092589809, + "flos": 21619608230880.0, + "grad_norm": 1.9736194745267148, + "language_loss": 0.75411093, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.78129941, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.784987211227417 + }, + { + "auxiliary_loss_clip": 0.0145919, + "auxiliary_loss_mlp": 0.01254944, + "balance_loss_clip": 1.14754128, + "balance_loss_mlp": 1.03674316, + "epoch": 0.4453329325116489, + "flos": 29500096583520.0, + "grad_norm": 1.7613009295287183, + "language_loss": 0.65536261, + "learning_rate": 2.447373973772129e-06, + "loss": 0.68250394, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 2.8542771339416504 + }, + { + "auxiliary_loss_clip": 0.01460355, + "auxiliary_loss_mlp": 0.01255128, + "balance_loss_clip": 1.14776587, + "balance_loss_mlp": 1.0348289, + "epoch": 0.44539305576431687, + "flos": 21363287175360.0, + "grad_norm": 1.6084406679325254, + "language_loss": 0.68363535, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.71079016, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.8016302585601807 + }, + { + "auxiliary_loss_clip": 0.01466535, + "auxiliary_loss_mlp": 0.01254356, + "balance_loss_clip": 1.15563715, + "balance_loss_mlp": 1.03253174, + "epoch": 0.44545317901698483, + "flos": 41430859797600.0, + "grad_norm": 1.4483138836620895, + "language_loss": 0.720891, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74809992, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.9085845947265625 + }, + { + "auxiliary_loss_clip": 0.01467478, + "auxiliary_loss_mlp": 0.01254921, + "balance_loss_clip": 1.15644717, + "balance_loss_mlp": 1.03156996, + "epoch": 0.4455133022696528, + "flos": 22057227712800.0, + "grad_norm": 2.0908048619927264, + "language_loss": 0.65083748, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67806149, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 5.19216775894165 + }, + { + "auxiliary_loss_clip": 0.01474579, + "auxiliary_loss_mlp": 0.0125597, + "balance_loss_clip": 1.16252565, + "balance_loss_mlp": 1.0330013, + "epoch": 0.44557342552232077, + "flos": 23479206567840.0, + "grad_norm": 2.102776299250577, + "language_loss": 0.73823786, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.7655434, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 2.8719353675842285 + }, + { + "auxiliary_loss_clip": 0.01469253, + "auxiliary_loss_mlp": 0.01252646, + "balance_loss_clip": 1.15885234, + "balance_loss_mlp": 1.03177476, + "epoch": 0.44563354877498873, + "flos": 19136654390880.0, + "grad_norm": 2.6547915046279518, + "language_loss": 0.79273313, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81995213, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 2.8016154766082764 + }, + { + "auxiliary_loss_clip": 0.0146083, + "auxiliary_loss_mlp": 0.01250244, + "balance_loss_clip": 1.14934564, + "balance_loss_mlp": 1.02498674, + "epoch": 0.4456936720276567, + "flos": 13622868974880.0, + "grad_norm": 2.1556893131698422, + "language_loss": 0.79652017, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82363093, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 2.7288765907287598 + }, + { + "auxiliary_loss_clip": 0.01464886, + "auxiliary_loss_mlp": 0.01247318, + "balance_loss_clip": 1.1546886, + "balance_loss_mlp": 1.02701914, + "epoch": 0.44575379528032466, + "flos": 14715097128000.0, + "grad_norm": 2.1158767950505375, + "language_loss": 0.76347268, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.79059482, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.711015462875366 + }, + { + "auxiliary_loss_clip": 0.01469154, + "auxiliary_loss_mlp": 0.01248628, + "balance_loss_clip": 1.15797806, + "balance_loss_mlp": 1.02623057, + "epoch": 0.4458139185329926, + "flos": 24172995392640.0, + "grad_norm": 1.5350744077008773, + "language_loss": 0.83366048, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.86083823, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 4.253509998321533 + }, + { + "auxiliary_loss_clip": 0.01466677, + "auxiliary_loss_mlp": 0.01242383, + "balance_loss_clip": 1.15529037, + "balance_loss_mlp": 1.01960456, + "epoch": 0.4458740417856606, + "flos": 21764381474880.0, + "grad_norm": 1.7313910129821766, + "language_loss": 0.84350437, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.87059498, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 2.7304208278656006 + }, + { + "auxiliary_loss_clip": 0.01460686, + "auxiliary_loss_mlp": 0.0125045, + "balance_loss_clip": 1.14974523, + "balance_loss_mlp": 1.02938867, + "epoch": 0.44593416503832856, + "flos": 21070896075360.0, + "grad_norm": 1.6167810875727477, + "language_loss": 0.81023258, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83734393, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 2.7840893268585205 + }, + { + "auxiliary_loss_clip": 0.01465423, + "auxiliary_loss_mlp": 0.01254724, + "balance_loss_clip": 1.15427399, + "balance_loss_mlp": 1.02927518, + "epoch": 0.4459942882909965, + "flos": 22602640118400.0, + "grad_norm": 1.9766055410524899, + "language_loss": 0.81189638, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83909786, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 2.823110342025757 + }, + { + "auxiliary_loss_clip": 0.01469617, + "auxiliary_loss_mlp": 0.01241054, + "balance_loss_clip": 1.15907145, + "balance_loss_mlp": 1.0157963, + "epoch": 0.4460544115436645, + "flos": 26507914166880.0, + "grad_norm": 1.758451101560436, + "language_loss": 0.774728, + "learning_rate": 2.442817638972991e-06, + "loss": 0.8018347, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.864664077758789 + }, + { + "auxiliary_loss_clip": 0.01459997, + "auxiliary_loss_mlp": 0.0124481, + "balance_loss_clip": 1.14923692, + "balance_loss_mlp": 1.02451181, + "epoch": 0.4461145347963325, + "flos": 17606010264480.0, + "grad_norm": 1.7486276894659918, + "language_loss": 0.72467756, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.75172561, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 2.789005756378174 + }, + { + "auxiliary_loss_clip": 0.01461787, + "auxiliary_loss_mlp": 0.01241731, + "balance_loss_clip": 1.15134597, + "balance_loss_mlp": 1.01914334, + "epoch": 0.44617465804900047, + "flos": 27270467474400.0, + "grad_norm": 1.4716265711637062, + "language_loss": 0.74698675, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77402198, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.8206615447998047 + }, + { + "auxiliary_loss_clip": 0.01463506, + "auxiliary_loss_mlp": 0.01242567, + "balance_loss_clip": 1.15353191, + "balance_loss_mlp": 1.02207756, + "epoch": 0.44623478130166844, + "flos": 17788408607520.0, + "grad_norm": 4.81433106068336, + "language_loss": 0.75823689, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78529763, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 2.807560443878174 + }, + { + "auxiliary_loss_clip": 0.01456026, + "auxiliary_loss_mlp": 0.01249833, + "balance_loss_clip": 1.14645541, + "balance_loss_mlp": 1.02610087, + "epoch": 0.4462949045543364, + "flos": 23005251544320.0, + "grad_norm": 1.4848776152039633, + "language_loss": 0.6504333, + "learning_rate": 2.441298322143784e-06, + "loss": 0.6774919, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 2.7751402854919434 + }, + { + "auxiliary_loss_clip": 0.01459983, + "auxiliary_loss_mlp": 0.01239705, + "balance_loss_clip": 1.1503123, + "balance_loss_mlp": 1.01826131, + "epoch": 0.44635502780700437, + "flos": 17821937465280.0, + "grad_norm": 1.752077062503266, + "language_loss": 0.78973991, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81673682, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.8704569339752197 + }, + { + "auxiliary_loss_clip": 0.01461488, + "auxiliary_loss_mlp": 0.01238973, + "balance_loss_clip": 1.15047121, + "balance_loss_mlp": 1.02134395, + "epoch": 0.44641515105967233, + "flos": 26690464222560.0, + "grad_norm": 1.4737836433856644, + "language_loss": 0.80316114, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.83016574, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.9532110691070557 + }, + { + "auxiliary_loss_clip": 0.01462927, + "auxiliary_loss_mlp": 0.01244565, + "balance_loss_clip": 1.15314829, + "balance_loss_mlp": 1.02502942, + "epoch": 0.4464752743123403, + "flos": 18915151750560.0, + "grad_norm": 1.5549333589105883, + "language_loss": 0.77686882, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.80394375, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 2.8125855922698975 + }, + { + "auxiliary_loss_clip": 0.01461989, + "auxiliary_loss_mlp": 0.01242266, + "balance_loss_clip": 1.15017033, + "balance_loss_mlp": 1.01929736, + "epoch": 0.44653539756500826, + "flos": 29572919379360.0, + "grad_norm": 1.9667534085588905, + "language_loss": 0.64432645, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.67136896, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 3.010934352874756 + }, + { + "auxiliary_loss_clip": 0.01470027, + "auxiliary_loss_mlp": 0.01250105, + "balance_loss_clip": 1.15899074, + "balance_loss_mlp": 1.02656412, + "epoch": 0.44659552081767623, + "flos": 21471042170880.0, + "grad_norm": 1.639358277602315, + "language_loss": 0.74880683, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77600813, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 2.8847663402557373 + }, + { + "auxiliary_loss_clip": 0.01461136, + "auxiliary_loss_mlp": 0.01246105, + "balance_loss_clip": 1.14903069, + "balance_loss_mlp": 1.02599692, + "epoch": 0.4466556440703442, + "flos": 17933864558400.0, + "grad_norm": 2.230024568365236, + "language_loss": 0.78027821, + "learning_rate": 2.439018845165806e-06, + "loss": 0.80735064, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 3.0019490718841553 + }, + { + "auxiliary_loss_clip": 0.01455645, + "auxiliary_loss_mlp": 0.0125268, + "balance_loss_clip": 1.14403272, + "balance_loss_mlp": 1.03257143, + "epoch": 0.44671576732301216, + "flos": 21109659019200.0, + "grad_norm": 2.0627565964147845, + "language_loss": 0.91084027, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93792355, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.9675283432006836 + }, + { + "auxiliary_loss_clip": 0.01461152, + "auxiliary_loss_mlp": 0.01258635, + "balance_loss_clip": 1.14897168, + "balance_loss_mlp": 1.03490293, + "epoch": 0.4467758905756801, + "flos": 23510914873920.0, + "grad_norm": 2.2445220490499618, + "language_loss": 0.80165362, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.82885158, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 2.933978796005249 + }, + { + "auxiliary_loss_clip": 0.01462597, + "auxiliary_loss_mlp": 0.01256774, + "balance_loss_clip": 1.14993596, + "balance_loss_mlp": 1.03609395, + "epoch": 0.4468360138283481, + "flos": 18736015229280.0, + "grad_norm": 2.0201972148579905, + "language_loss": 0.79944241, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82663614, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.9465203285217285 + }, + { + "auxiliary_loss_clip": 0.01453633, + "auxiliary_loss_mlp": 0.01248458, + "balance_loss_clip": 1.14172113, + "balance_loss_mlp": 1.02758718, + "epoch": 0.4468961370810161, + "flos": 23479396208640.0, + "grad_norm": 2.346270993028319, + "language_loss": 0.77225256, + "learning_rate": 2.437498860702301e-06, + "loss": 0.79927349, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.959463357925415 + }, + { + "auxiliary_loss_clip": 0.01451342, + "auxiliary_loss_mlp": 0.01253639, + "balance_loss_clip": 1.13924527, + "balance_loss_mlp": 1.03658259, + "epoch": 0.4469562603336841, + "flos": 30077331079680.0, + "grad_norm": 1.8141502361963564, + "language_loss": 0.77782929, + "learning_rate": 2.437118823075398e-06, + "loss": 0.80487913, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 3.0154178142547607 + }, + { + "auxiliary_loss_clip": 0.01452228, + "auxiliary_loss_mlp": 0.01257707, + "balance_loss_clip": 1.14012933, + "balance_loss_mlp": 1.03855228, + "epoch": 0.44701638358635204, + "flos": 22458663365760.0, + "grad_norm": 1.8179500961383928, + "language_loss": 0.64495277, + "learning_rate": 2.436738768872905e-06, + "loss": 0.67205215, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 2.9581127166748047 + }, + { + "auxiliary_loss_clip": 0.0146154, + "auxiliary_loss_mlp": 0.01258558, + "balance_loss_clip": 1.14864612, + "balance_loss_mlp": 1.03730547, + "epoch": 0.44707650683902, + "flos": 24059968382880.0, + "grad_norm": 1.676638723904669, + "language_loss": 0.83656359, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.86376458, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.90128493309021 + }, + { + "auxiliary_loss_clip": 0.01459491, + "auxiliary_loss_mlp": 0.01273659, + "balance_loss_clip": 1.14704895, + "balance_loss_mlp": 1.05164385, + "epoch": 0.44713663009168797, + "flos": 23769094409280.0, + "grad_norm": 1.6564620877372367, + "language_loss": 0.79718411, + "learning_rate": 2.435978610798798e-06, + "loss": 0.82451558, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.9761526584625244 + }, + { + "auxiliary_loss_clip": 0.0145703, + "auxiliary_loss_mlp": 0.0126375, + "balance_loss_clip": 1.14362431, + "balance_loss_mlp": 1.04154396, + "epoch": 0.44719675334435594, + "flos": 24501608249760.0, + "grad_norm": 2.23745209953619, + "language_loss": 0.71798801, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74519581, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 3.017812490463257 + }, + { + "auxiliary_loss_clip": 0.01456931, + "auxiliary_loss_mlp": 0.01260276, + "balance_loss_clip": 1.14287138, + "balance_loss_mlp": 1.03845179, + "epoch": 0.4472568765970239, + "flos": 29783953847520.0, + "grad_norm": 1.6987757564947883, + "language_loss": 0.67313504, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.70030713, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 3.0843617916107178 + }, + { + "auxiliary_loss_clip": 0.01456645, + "auxiliary_loss_mlp": 0.01277661, + "balance_loss_clip": 1.1432997, + "balance_loss_mlp": 1.05621791, + "epoch": 0.44731699984969187, + "flos": 24645357433440.0, + "grad_norm": 1.7782764114529273, + "language_loss": 0.73909783, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.76644087, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 4.5826380252838135 + }, + { + "auxiliary_loss_clip": 0.01455291, + "auxiliary_loss_mlp": 0.01265342, + "balance_loss_clip": 1.14081669, + "balance_loss_mlp": 1.0463779, + "epoch": 0.44737712310235983, + "flos": 29457995961600.0, + "grad_norm": 1.656661811648386, + "language_loss": 0.74295282, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.77015913, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.9737021923065186 + }, + { + "auxiliary_loss_clip": 0.01457087, + "auxiliary_loss_mlp": 0.01251457, + "balance_loss_clip": 1.1424737, + "balance_loss_mlp": 1.03153992, + "epoch": 0.4474372463550278, + "flos": 24898985589600.0, + "grad_norm": 1.874384171950789, + "language_loss": 0.75194365, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.77902901, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.9131593704223633 + }, + { + "auxiliary_loss_clip": 0.01452354, + "auxiliary_loss_mlp": 0.01257951, + "balance_loss_clip": 1.13686311, + "balance_loss_mlp": 1.03555369, + "epoch": 0.44749736960769576, + "flos": 33184361057760.0, + "grad_norm": 1.895601408806374, + "language_loss": 0.74457979, + "learning_rate": 2.433697740261273e-06, + "loss": 0.77168286, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.982414722442627 + }, + { + "auxiliary_loss_clip": 0.01451507, + "auxiliary_loss_mlp": 0.0125601, + "balance_loss_clip": 1.13776898, + "balance_loss_mlp": 1.03552103, + "epoch": 0.4475574928603637, + "flos": 21074082040800.0, + "grad_norm": 1.7884078205231375, + "language_loss": 0.77991229, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.8069874, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.886101245880127 + }, + { + "auxiliary_loss_clip": 0.01453843, + "auxiliary_loss_mlp": 0.01261344, + "balance_loss_clip": 1.13967514, + "balance_loss_mlp": 1.04085505, + "epoch": 0.4476176161130317, + "flos": 21864171556800.0, + "grad_norm": 2.817338524597821, + "language_loss": 0.85177666, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87892854, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 2.826874017715454 + }, + { + "auxiliary_loss_clip": 0.01456384, + "auxiliary_loss_mlp": 0.01261701, + "balance_loss_clip": 1.13987589, + "balance_loss_mlp": 1.03720593, + "epoch": 0.4476777393656997, + "flos": 22530879311040.0, + "grad_norm": 1.974344573597333, + "language_loss": 0.64400387, + "learning_rate": 2.432557082778765e-06, + "loss": 0.67118478, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.838271141052246 + }, + { + "auxiliary_loss_clip": 0.0151559, + "auxiliary_loss_mlp": 0.01208672, + "balance_loss_clip": 1.21709299, + "balance_loss_mlp": 0.99886322, + "epoch": 0.4477378626183677, + "flos": 49022863286400.0, + "grad_norm": 0.7533308706645735, + "language_loss": 0.50175738, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52899998, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 4.609310626983643 + }, + { + "auxiliary_loss_clip": 0.01518166, + "auxiliary_loss_mlp": 0.01207436, + "balance_loss_clip": 1.21939015, + "balance_loss_mlp": 0.9983902, + "epoch": 0.44779798587103564, + "flos": 56548833484320.0, + "grad_norm": 0.7971362538114426, + "language_loss": 0.59375, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.62100601, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 5.154995441436768 + }, + { + "auxiliary_loss_clip": 0.01445274, + "auxiliary_loss_mlp": 0.01248043, + "balance_loss_clip": 1.12933183, + "balance_loss_mlp": 1.02965128, + "epoch": 0.4478581091237036, + "flos": 46502019214560.0, + "grad_norm": 1.5682109243724962, + "language_loss": 0.58940494, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61633813, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 3.0617778301239014 + }, + { + "auxiliary_loss_clip": 0.01453774, + "auxiliary_loss_mlp": 0.01262418, + "balance_loss_clip": 1.13949394, + "balance_loss_mlp": 1.04059374, + "epoch": 0.4479182323763716, + "flos": 20816509356000.0, + "grad_norm": 1.8616272593598868, + "language_loss": 0.80335045, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.8305124, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 2.99501633644104 + }, + { + "auxiliary_loss_clip": 0.01449204, + "auxiliary_loss_mlp": 0.01257409, + "balance_loss_clip": 1.13507473, + "balance_loss_mlp": 1.03691936, + "epoch": 0.44797835562903954, + "flos": 14247703676160.0, + "grad_norm": 2.126707361404925, + "language_loss": 0.79098666, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81805277, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 2.8382272720336914 + }, + { + "auxiliary_loss_clip": 0.01517736, + "auxiliary_loss_mlp": 0.01226105, + "balance_loss_clip": 1.21780229, + "balance_loss_mlp": 1.01858521, + "epoch": 0.4480384788817075, + "flos": 63540883873440.0, + "grad_norm": 0.8297184232044145, + "language_loss": 0.62796068, + "learning_rate": 2.430275325332681e-06, + "loss": 0.65539896, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 3.431828022003174 + }, + { + "auxiliary_loss_clip": 0.01458172, + "auxiliary_loss_mlp": 0.0124912, + "balance_loss_clip": 1.14389563, + "balance_loss_mlp": 1.02672279, + "epoch": 0.44809860213437547, + "flos": 21654767999520.0, + "grad_norm": 2.2522446039965147, + "language_loss": 0.62672287, + "learning_rate": 2.429894975234582e-06, + "loss": 0.65379572, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 2.868605375289917 + }, + { + "auxiliary_loss_clip": 0.01513415, + "auxiliary_loss_mlp": 0.01233871, + "balance_loss_clip": 1.21470308, + "balance_loss_mlp": 1.02787781, + "epoch": 0.44815872538704343, + "flos": 69197318556480.0, + "grad_norm": 0.7949948941710605, + "language_loss": 0.56955004, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59702289, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 4.618616342544556 + }, + { + "auxiliary_loss_clip": 0.01453827, + "auxiliary_loss_mlp": 0.01262834, + "balance_loss_clip": 1.13948774, + "balance_loss_mlp": 1.04367971, + "epoch": 0.4482188486397114, + "flos": 12599594945280.0, + "grad_norm": 7.236750811236525, + "language_loss": 0.75380147, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.78096807, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.7947516441345215 + }, + { + "auxiliary_loss_clip": 0.01451226, + "auxiliary_loss_mlp": 0.01258327, + "balance_loss_clip": 1.13675404, + "balance_loss_mlp": 1.03840983, + "epoch": 0.44827897189237936, + "flos": 34061041291680.0, + "grad_norm": 1.926636006292865, + "language_loss": 0.76066196, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78775746, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 2.89041805267334 + }, + { + "auxiliary_loss_clip": 0.01459465, + "auxiliary_loss_mlp": 0.01252659, + "balance_loss_clip": 1.14470088, + "balance_loss_mlp": 1.02968979, + "epoch": 0.44833909514504733, + "flos": 25149313995840.0, + "grad_norm": 1.864577251967849, + "language_loss": 0.76084149, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78796279, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 2.8106491565704346 + }, + { + "auxiliary_loss_clip": 0.01452552, + "auxiliary_loss_mlp": 0.0125592, + "balance_loss_clip": 1.13882172, + "balance_loss_mlp": 1.03276026, + "epoch": 0.4483992183977153, + "flos": 16181831576160.0, + "grad_norm": 7.64104320935754, + "language_loss": 0.68248892, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70957363, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.8224503993988037 + }, + { + "auxiliary_loss_clip": 0.01457111, + "auxiliary_loss_mlp": 0.01256316, + "balance_loss_clip": 1.14252329, + "balance_loss_mlp": 1.02972293, + "epoch": 0.44845934165038326, + "flos": 17747825112000.0, + "grad_norm": 1.8951411494537642, + "language_loss": 0.71684617, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74398041, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 2.785262107849121 + }, + { + "auxiliary_loss_clip": 0.01453591, + "auxiliary_loss_mlp": 0.01238361, + "balance_loss_clip": 1.14079666, + "balance_loss_mlp": 1.01558232, + "epoch": 0.4485194649030513, + "flos": 21838114546560.0, + "grad_norm": 1.783563288581948, + "language_loss": 0.69662982, + "learning_rate": 2.427232068909154e-06, + "loss": 0.72354937, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.8022501468658447 + }, + { + "auxiliary_loss_clip": 0.01451514, + "auxiliary_loss_mlp": 0.01247663, + "balance_loss_clip": 1.13652086, + "balance_loss_mlp": 1.02450335, + "epoch": 0.44857958815571924, + "flos": 20086612558560.0, + "grad_norm": 2.156466608773332, + "language_loss": 0.77248299, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79947478, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 2.733489751815796 + }, + { + "auxiliary_loss_clip": 0.01453369, + "auxiliary_loss_mlp": 0.01247291, + "balance_loss_clip": 1.14014089, + "balance_loss_mlp": 1.02279556, + "epoch": 0.4486397114083872, + "flos": 27056019471840.0, + "grad_norm": 2.368679376046303, + "language_loss": 0.67364794, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70065451, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 2.8263237476348877 + }, + { + "auxiliary_loss_clip": 0.01505658, + "auxiliary_loss_mlp": 0.01205414, + "balance_loss_clip": 1.21170712, + "balance_loss_mlp": 0.99789429, + "epoch": 0.4486998346610552, + "flos": 67327289975520.0, + "grad_norm": 0.7445416305320145, + "language_loss": 0.54455078, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.57166147, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 3.358220100402832 + }, + { + "auxiliary_loss_clip": 0.01456102, + "auxiliary_loss_mlp": 0.012425, + "balance_loss_clip": 1.14214659, + "balance_loss_mlp": 1.02124763, + "epoch": 0.44875995791372314, + "flos": 27639322473600.0, + "grad_norm": 2.4526910093013607, + "language_loss": 0.75495344, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.78193951, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.842437982559204 + }, + { + "auxiliary_loss_clip": 0.0144178, + "auxiliary_loss_mlp": 0.0124736, + "balance_loss_clip": 1.12761104, + "balance_loss_mlp": 1.02839661, + "epoch": 0.4488200811663911, + "flos": 13007630098080.0, + "grad_norm": 2.6431877982633925, + "language_loss": 0.74058098, + "learning_rate": 2.425329506653441e-06, + "loss": 0.76747239, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 2.795498847961426 + }, + { + "auxiliary_loss_clip": 0.0145576, + "auxiliary_loss_mlp": 0.01255009, + "balance_loss_clip": 1.14262438, + "balance_loss_mlp": 1.03013301, + "epoch": 0.44888020441905907, + "flos": 27492425252640.0, + "grad_norm": 2.412066029448976, + "language_loss": 0.79666787, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82377553, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.7965404987335205 + }, + { + "auxiliary_loss_clip": 0.01448669, + "auxiliary_loss_mlp": 0.01246918, + "balance_loss_clip": 1.13490248, + "balance_loss_mlp": 1.02509308, + "epoch": 0.44894032767172704, + "flos": 18261642996000.0, + "grad_norm": 4.916665143513868, + "language_loss": 0.8055886, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.83254445, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 2.8274316787719727 + }, + { + "auxiliary_loss_clip": 0.01451652, + "auxiliary_loss_mlp": 0.01258062, + "balance_loss_clip": 1.13807702, + "balance_loss_mlp": 1.03986132, + "epoch": 0.449000450924395, + "flos": 21581983131840.0, + "grad_norm": 2.090450509161485, + "language_loss": 0.75122237, + "learning_rate": 2.424187775642129e-06, + "loss": 0.77831954, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.7677817344665527 + }, + { + "auxiliary_loss_clip": 0.01446643, + "auxiliary_loss_mlp": 0.01247435, + "balance_loss_clip": 1.1334008, + "balance_loss_mlp": 1.02904356, + "epoch": 0.44906057417706297, + "flos": 17969858746560.0, + "grad_norm": 2.7107779852418328, + "language_loss": 0.70919478, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73613554, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.8010449409484863 + }, + { + "auxiliary_loss_clip": 0.01452265, + "auxiliary_loss_mlp": 0.01242215, + "balance_loss_clip": 1.13823295, + "balance_loss_mlp": 1.02019954, + "epoch": 0.44912069742973093, + "flos": 20049404669280.0, + "grad_norm": 1.769970884860552, + "language_loss": 0.72213227, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74907708, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.7703022956848145 + }, + { + "auxiliary_loss_clip": 0.01441845, + "auxiliary_loss_mlp": 0.01251784, + "balance_loss_clip": 1.12822175, + "balance_loss_mlp": 1.03339231, + "epoch": 0.4491808206823989, + "flos": 21035698378560.0, + "grad_norm": 1.8941303256329551, + "language_loss": 0.76595068, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79288697, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 2.7854907512664795 + }, + { + "auxiliary_loss_clip": 0.01454922, + "auxiliary_loss_mlp": 0.01259094, + "balance_loss_clip": 1.14092469, + "balance_loss_mlp": 1.03536189, + "epoch": 0.44924094393506686, + "flos": 22969409068800.0, + "grad_norm": 1.8489121742068946, + "language_loss": 0.70015961, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72729975, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.839951515197754 + }, + { + "auxiliary_loss_clip": 0.01493298, + "auxiliary_loss_mlp": 0.01217415, + "balance_loss_clip": 1.20170712, + "balance_loss_mlp": 1.01218414, + "epoch": 0.4493010671877349, + "flos": 59239787539680.0, + "grad_norm": 0.7426645871492585, + "language_loss": 0.61611742, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.6432246, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.293976306915283 + }, + { + "auxiliary_loss_clip": 0.01447116, + "auxiliary_loss_mlp": 0.01258787, + "balance_loss_clip": 1.13288617, + "balance_loss_mlp": 1.03925157, + "epoch": 0.44936119044040285, + "flos": 18006876995040.0, + "grad_norm": 2.268490919025664, + "language_loss": 0.77979386, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80685288, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 2.7883355617523193 + }, + { + "auxiliary_loss_clip": 0.01454018, + "auxiliary_loss_mlp": 0.01251686, + "balance_loss_clip": 1.13938868, + "balance_loss_mlp": 1.03138685, + "epoch": 0.4494213136930708, + "flos": 21253977125280.0, + "grad_norm": 1.8154232599551003, + "language_loss": 0.72327501, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.75033206, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 2.8349990844726562 + }, + { + "auxiliary_loss_clip": 0.01440918, + "auxiliary_loss_mlp": 0.01248463, + "balance_loss_clip": 1.12823009, + "balance_loss_mlp": 1.03026271, + "epoch": 0.4494814369457388, + "flos": 27421233367680.0, + "grad_norm": 2.088223878009255, + "language_loss": 0.77114773, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.79804158, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.8741044998168945 + }, + { + "auxiliary_loss_clip": 0.0145246, + "auxiliary_loss_mlp": 0.01253845, + "balance_loss_clip": 1.13818622, + "balance_loss_mlp": 1.03144825, + "epoch": 0.44954156019840674, + "flos": 22856116561920.0, + "grad_norm": 2.0323858205701795, + "language_loss": 0.71885657, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.74591959, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.784876823425293 + }, + { + "auxiliary_loss_clip": 0.01450979, + "auxiliary_loss_mlp": 0.01254454, + "balance_loss_clip": 1.13696861, + "balance_loss_mlp": 1.03148544, + "epoch": 0.4496016834510747, + "flos": 17203474694880.0, + "grad_norm": 2.488462823617563, + "language_loss": 0.67951453, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70656884, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 4.535475492477417 + }, + { + "auxiliary_loss_clip": 0.01449925, + "auxiliary_loss_mlp": 0.01245367, + "balance_loss_clip": 1.13683319, + "balance_loss_mlp": 1.02697539, + "epoch": 0.4496618067037427, + "flos": 18918641141280.0, + "grad_norm": 1.874426484033325, + "language_loss": 0.89489067, + "learning_rate": 2.420000193000779e-06, + "loss": 0.92184365, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.819157361984253 + }, + { + "auxiliary_loss_clip": 0.01448212, + "auxiliary_loss_mlp": 0.01260401, + "balance_loss_clip": 1.13449752, + "balance_loss_mlp": 1.03724146, + "epoch": 0.44972192995641064, + "flos": 21033726114240.0, + "grad_norm": 1.891996985166049, + "language_loss": 0.75737923, + "learning_rate": 2.419619407822302e-06, + "loss": 0.78446537, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 2.868501901626587 + }, + { + "auxiliary_loss_clip": 0.01454799, + "auxiliary_loss_mlp": 0.01249903, + "balance_loss_clip": 1.1410073, + "balance_loss_mlp": 1.02578998, + "epoch": 0.4497820532090786, + "flos": 20779187682240.0, + "grad_norm": 2.2915681087061746, + "language_loss": 0.79629767, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82334471, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.7573297023773193 + }, + { + "auxiliary_loss_clip": 0.01445493, + "auxiliary_loss_mlp": 0.01249353, + "balance_loss_clip": 1.13055491, + "balance_loss_mlp": 1.03153419, + "epoch": 0.44984217646174657, + "flos": 33805440871200.0, + "grad_norm": 2.3680238149700945, + "language_loss": 0.68646049, + "learning_rate": 2.418857789743758e-06, + "loss": 0.71340895, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 2.9021263122558594 + }, + { + "auxiliary_loss_clip": 0.01450046, + "auxiliary_loss_mlp": 0.0125704, + "balance_loss_clip": 1.13667095, + "balance_loss_mlp": 1.03674102, + "epoch": 0.44990229971441453, + "flos": 15519902770080.0, + "grad_norm": 1.9267704276489386, + "language_loss": 0.84300876, + "learning_rate": 2.418476956872571e-06, + "loss": 0.87007964, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.713792085647583 + }, + { + "auxiliary_loss_clip": 0.01454551, + "auxiliary_loss_mlp": 0.012501, + "balance_loss_clip": 1.14141285, + "balance_loss_mlp": 1.02789354, + "epoch": 0.4499624229670825, + "flos": 29864513988000.0, + "grad_norm": 1.9675250774155966, + "language_loss": 0.80387926, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.8309257, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.8935511112213135 + }, + { + "auxiliary_loss_clip": 0.01454221, + "auxiliary_loss_mlp": 0.01257295, + "balance_loss_clip": 1.14132071, + "balance_loss_mlp": 1.03470731, + "epoch": 0.45002254621975046, + "flos": 18515574577440.0, + "grad_norm": 2.420200225647099, + "language_loss": 0.75023943, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.7773546, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 4.361106634140015 + }, + { + "auxiliary_loss_clip": 0.01486229, + "auxiliary_loss_mlp": 0.01204346, + "balance_loss_clip": 1.20021141, + "balance_loss_mlp": 0.99835205, + "epoch": 0.4500826694724185, + "flos": 70426544680800.0, + "grad_norm": 0.787193540755516, + "language_loss": 0.58558279, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.61248857, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 4.941924333572388 + }, + { + "auxiliary_loss_clip": 0.01446262, + "auxiliary_loss_mlp": 0.01246618, + "balance_loss_clip": 1.13266349, + "balance_loss_mlp": 1.02593791, + "epoch": 0.45014279272508645, + "flos": 15780737276640.0, + "grad_norm": 2.5083549405444234, + "language_loss": 0.8284803, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85540909, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.7214787006378174 + }, + { + "auxiliary_loss_clip": 0.01443116, + "auxiliary_loss_mlp": 0.01239184, + "balance_loss_clip": 1.12992787, + "balance_loss_mlp": 1.01831317, + "epoch": 0.4502029159777544, + "flos": 21801779004960.0, + "grad_norm": 1.585819495395266, + "language_loss": 0.77099735, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79782033, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 2.850085735321045 + }, + { + "auxiliary_loss_clip": 0.01452558, + "auxiliary_loss_mlp": 0.01261262, + "balance_loss_clip": 1.14020109, + "balance_loss_mlp": 1.03753018, + "epoch": 0.4502630392304224, + "flos": 28770541139520.0, + "grad_norm": 2.265667384805241, + "language_loss": 0.71969104, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74682927, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 2.8253707885742188 + }, + { + "auxiliary_loss_clip": 0.01460313, + "auxiliary_loss_mlp": 0.01246924, + "balance_loss_clip": 1.14643633, + "balance_loss_mlp": 1.0220474, + "epoch": 0.45032316248309034, + "flos": 15845595158880.0, + "grad_norm": 3.587698698755627, + "language_loss": 0.69474149, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.7218138, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 2.7876014709472656 + }, + { + "auxiliary_loss_clip": 0.01489166, + "auxiliary_loss_mlp": 0.0121479, + "balance_loss_clip": 1.20525777, + "balance_loss_mlp": 1.00955963, + "epoch": 0.4503832857357583, + "flos": 57860212731840.0, + "grad_norm": 0.7473171176027094, + "language_loss": 0.56640822, + "learning_rate": 2.415429723843495e-06, + "loss": 0.5934478, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 3.334073305130005 + }, + { + "auxiliary_loss_clip": 0.01450262, + "auxiliary_loss_mlp": 0.01244981, + "balance_loss_clip": 1.13905811, + "balance_loss_mlp": 1.02601695, + "epoch": 0.4504434089884263, + "flos": 23880338795520.0, + "grad_norm": 1.7883629043888534, + "language_loss": 0.79821622, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.82516873, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 4.272010326385498 + }, + { + "auxiliary_loss_clip": 0.01456574, + "auxiliary_loss_mlp": 0.01250534, + "balance_loss_clip": 1.14354587, + "balance_loss_mlp": 1.02527583, + "epoch": 0.45050353224109424, + "flos": 17787005265600.0, + "grad_norm": 2.39807741905947, + "language_loss": 0.93074077, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.95781189, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 2.770404100418091 + }, + { + "auxiliary_loss_clip": 0.01490916, + "auxiliary_loss_mlp": 0.01230011, + "balance_loss_clip": 1.2072506, + "balance_loss_mlp": 1.02401733, + "epoch": 0.4505636554937622, + "flos": 65069631591840.0, + "grad_norm": 0.807425183525956, + "language_loss": 0.627505, + "learning_rate": 2.4142867511336e-06, + "loss": 0.65471429, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 3.33894681930542 + }, + { + "auxiliary_loss_clip": 0.01449424, + "auxiliary_loss_mlp": 0.01249262, + "balance_loss_clip": 1.13744879, + "balance_loss_mlp": 1.03125191, + "epoch": 0.45062377874643017, + "flos": 22202266453920.0, + "grad_norm": 1.5686392614347213, + "language_loss": 0.82167661, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84866351, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 2.804778575897217 + }, + { + "auxiliary_loss_clip": 0.01449679, + "auxiliary_loss_mlp": 0.0125131, + "balance_loss_clip": 1.13846648, + "balance_loss_mlp": 1.03158379, + "epoch": 0.45068390199909814, + "flos": 37673355317760.0, + "grad_norm": 1.7439282197572077, + "language_loss": 0.86026967, + "learning_rate": 2.41352469075395e-06, + "loss": 0.88727957, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 2.9432435035705566 + }, + { + "auxiliary_loss_clip": 0.01455417, + "auxiliary_loss_mlp": 0.01253539, + "balance_loss_clip": 1.14285016, + "balance_loss_mlp": 1.03209579, + "epoch": 0.4507440252517661, + "flos": 22304294297280.0, + "grad_norm": 1.991304018581301, + "language_loss": 0.76109284, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.7881825, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 2.7334470748901367 + }, + { + "auxiliary_loss_clip": 0.01452996, + "auxiliary_loss_mlp": 0.01247411, + "balance_loss_clip": 1.13975883, + "balance_loss_mlp": 1.02654004, + "epoch": 0.45080414850443407, + "flos": 13190066369280.0, + "grad_norm": 2.002102043909523, + "language_loss": 0.74971545, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77671945, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.8529715538024902 + }, + { + "auxiliary_loss_clip": 0.01455649, + "auxiliary_loss_mlp": 0.01247961, + "balance_loss_clip": 1.14314389, + "balance_loss_mlp": 1.02270317, + "epoch": 0.4508642717571021, + "flos": 21947121171360.0, + "grad_norm": 2.38356843862308, + "language_loss": 0.69910908, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72614515, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.75754451751709 + }, + { + "auxiliary_loss_clip": 0.01453727, + "auxiliary_loss_mlp": 0.01247909, + "balance_loss_clip": 1.14169312, + "balance_loss_mlp": 1.02474952, + "epoch": 0.45092439500977005, + "flos": 23369706876960.0, + "grad_norm": 2.054594710766761, + "language_loss": 0.7702533, + "learning_rate": 2.412000381939477e-06, + "loss": 0.7972697, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 2.7961678504943848 + }, + { + "auxiliary_loss_clip": 0.01454616, + "auxiliary_loss_mlp": 0.01240078, + "balance_loss_clip": 1.1428647, + "balance_loss_mlp": 1.01882553, + "epoch": 0.450984518262438, + "flos": 20774825943840.0, + "grad_norm": 2.0746214135318657, + "language_loss": 0.62570596, + "learning_rate": 2.411619265641992e-06, + "loss": 0.65265298, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 2.7819020748138428 + }, + { + "auxiliary_loss_clip": 0.01455884, + "auxiliary_loss_mlp": 0.01237659, + "balance_loss_clip": 1.14380884, + "balance_loss_mlp": 1.01545262, + "epoch": 0.451044641515106, + "flos": 17709024240000.0, + "grad_norm": 1.9296782998124147, + "language_loss": 0.84493017, + "learning_rate": 2.411238133735863e-06, + "loss": 0.87186563, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.730693817138672 + }, + { + "auxiliary_loss_clip": 0.01451727, + "auxiliary_loss_mlp": 0.01243043, + "balance_loss_clip": 1.14071321, + "balance_loss_mlp": 1.02121854, + "epoch": 0.45110476476777395, + "flos": 20596789339200.0, + "grad_norm": 1.4496470612787244, + "language_loss": 0.79516363, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.82211137, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 2.7839136123657227 + }, + { + "auxiliary_loss_clip": 0.01456631, + "auxiliary_loss_mlp": 0.0125037, + "balance_loss_clip": 1.14607584, + "balance_loss_mlp": 1.02968943, + "epoch": 0.4511648880204419, + "flos": 16035882559200.0, + "grad_norm": 2.0681902851130527, + "language_loss": 0.81243461, + "learning_rate": 2.410475823155484e-06, + "loss": 0.8395046, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 2.753857135772705 + }, + { + "auxiliary_loss_clip": 0.01452628, + "auxiliary_loss_mlp": 0.01254761, + "balance_loss_clip": 1.141644, + "balance_loss_mlp": 1.03522539, + "epoch": 0.4512250112731099, + "flos": 23980584015360.0, + "grad_norm": 1.7288262165534876, + "language_loss": 0.63062721, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65770113, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.7918663024902344 + }, + { + "auxiliary_loss_clip": 0.01491483, + "auxiliary_loss_mlp": 0.01211525, + "balance_loss_clip": 1.20817494, + "balance_loss_mlp": 1.00553131, + "epoch": 0.45128513452577784, + "flos": 71469655138080.0, + "grad_norm": 0.8340132468593819, + "language_loss": 0.58745652, + "learning_rate": 2.409713450313968e-06, + "loss": 0.61448658, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 3.452659845352173 + }, + { + "auxiliary_loss_clip": 0.01452514, + "auxiliary_loss_mlp": 0.01246847, + "balance_loss_clip": 1.14021838, + "balance_loss_mlp": 1.02635717, + "epoch": 0.4513452577784458, + "flos": 22093032260160.0, + "grad_norm": 2.5341836017316175, + "language_loss": 0.7939567, + "learning_rate": 2.40933224058142e-06, + "loss": 0.82095027, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.8072025775909424 + }, + { + "auxiliary_loss_clip": 0.01450304, + "auxiliary_loss_mlp": 0.01250152, + "balance_loss_clip": 1.13856483, + "balance_loss_mlp": 1.03214216, + "epoch": 0.4514053810311138, + "flos": 24278512626720.0, + "grad_norm": 1.5740030350034313, + "language_loss": 0.73842198, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76542652, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.822499990463257 + }, + { + "auxiliary_loss_clip": 0.01456744, + "auxiliary_loss_mlp": 0.01245182, + "balance_loss_clip": 1.14528966, + "balance_loss_mlp": 1.02640879, + "epoch": 0.45146550428378174, + "flos": 17888274545760.0, + "grad_norm": 2.1951290814985995, + "language_loss": 0.7989794, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.82599866, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.733720541000366 + }, + { + "auxiliary_loss_clip": 0.01446273, + "auxiliary_loss_mlp": 0.01239984, + "balance_loss_clip": 1.13475609, + "balance_loss_mlp": 1.01968539, + "epoch": 0.4515256275364497, + "flos": 24245742332160.0, + "grad_norm": 1.7540902413694992, + "language_loss": 0.73074073, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75760329, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 2.8583710193634033 + }, + { + "auxiliary_loss_clip": 0.01451945, + "auxiliary_loss_mlp": 0.01238971, + "balance_loss_clip": 1.13853586, + "balance_loss_mlp": 1.01790965, + "epoch": 0.45158575078911767, + "flos": 20633124880800.0, + "grad_norm": 2.7390235947030415, + "language_loss": 0.77131224, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79822147, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 2.760899305343628 + }, + { + "auxiliary_loss_clip": 0.01450649, + "auxiliary_loss_mlp": 0.01253147, + "balance_loss_clip": 1.1380918, + "balance_loss_mlp": 1.032848, + "epoch": 0.45164587404178563, + "flos": 23329806088320.0, + "grad_norm": 2.0333469166781315, + "language_loss": 0.78762138, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.81465936, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.8003652095794678 + }, + { + "auxiliary_loss_clip": 0.01452381, + "auxiliary_loss_mlp": 0.01248144, + "balance_loss_clip": 1.1392144, + "balance_loss_mlp": 1.02383971, + "epoch": 0.45170599729445365, + "flos": 23808274562880.0, + "grad_norm": 3.131968516682631, + "language_loss": 0.87709737, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.90410268, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.7854340076446533 + }, + { + "auxiliary_loss_clip": 0.01450387, + "auxiliary_loss_mlp": 0.01238174, + "balance_loss_clip": 1.13843858, + "balance_loss_mlp": 1.0192101, + "epoch": 0.4517661205471216, + "flos": 23515162827840.0, + "grad_norm": 1.608894757070873, + "language_loss": 0.67100346, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69788903, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.81986403465271 + }, + { + "auxiliary_loss_clip": 0.01463717, + "auxiliary_loss_mlp": 0.01249547, + "balance_loss_clip": 1.1497848, + "balance_loss_mlp": 1.02695894, + "epoch": 0.4518262437997896, + "flos": 23516224816320.0, + "grad_norm": 2.0996939358253504, + "language_loss": 0.69732511, + "learning_rate": 2.406282005146318e-06, + "loss": 0.7244578, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 2.7778854370117188 + }, + { + "auxiliary_loss_clip": 0.01452791, + "auxiliary_loss_mlp": 0.01256602, + "balance_loss_clip": 1.13948274, + "balance_loss_mlp": 1.03439569, + "epoch": 0.45188636705245755, + "flos": 14569451536320.0, + "grad_norm": 2.8694561805003724, + "language_loss": 0.82065302, + "learning_rate": 2.405900656236963e-06, + "loss": 0.84774697, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 4.4268763065338135 + }, + { + "auxiliary_loss_clip": 0.01456111, + "auxiliary_loss_mlp": 0.01255525, + "balance_loss_clip": 1.14302814, + "balance_loss_mlp": 1.03598905, + "epoch": 0.4519464903051255, + "flos": 19903607364960.0, + "grad_norm": 2.003581897006805, + "language_loss": 0.65231431, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67943072, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.759235143661499 + }, + { + "auxiliary_loss_clip": 0.01455823, + "auxiliary_loss_mlp": 0.01243212, + "balance_loss_clip": 1.14354253, + "balance_loss_mlp": 1.0252018, + "epoch": 0.4520066135577935, + "flos": 18846425196000.0, + "grad_norm": 1.900321033617832, + "language_loss": 0.62956303, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65655339, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.6426353454589844 + }, + { + "auxiliary_loss_clip": 0.01455238, + "auxiliary_loss_mlp": 0.01249164, + "balance_loss_clip": 1.14224815, + "balance_loss_mlp": 1.02867436, + "epoch": 0.45206673681046144, + "flos": 48218058008640.0, + "grad_norm": 1.4325410350820478, + "language_loss": 0.59437013, + "learning_rate": 2.404756517215982e-06, + "loss": 0.62141418, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.8889148235321045 + }, + { + "auxiliary_loss_clip": 0.01454039, + "auxiliary_loss_mlp": 0.01246359, + "balance_loss_clip": 1.14193487, + "balance_loss_mlp": 1.02739525, + "epoch": 0.4521268600631294, + "flos": 23844458391840.0, + "grad_norm": 1.4526647612709966, + "language_loss": 0.72521174, + "learning_rate": 2.404375106826223e-06, + "loss": 0.75221574, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.826385021209717 + }, + { + "auxiliary_loss_clip": 0.01450818, + "auxiliary_loss_mlp": 0.01252182, + "balance_loss_clip": 1.13762212, + "balance_loss_mlp": 1.02940321, + "epoch": 0.4521869833157974, + "flos": 18845628704640.0, + "grad_norm": 1.9096773104451614, + "language_loss": 0.75706208, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.78409207, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 2.717756986618042 + }, + { + "auxiliary_loss_clip": 0.01453035, + "auxiliary_loss_mlp": 0.01240218, + "balance_loss_clip": 1.13823819, + "balance_loss_mlp": 1.01782084, + "epoch": 0.45224710656846534, + "flos": 19789897648320.0, + "grad_norm": 2.0416725003698377, + "language_loss": 0.67983544, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70676804, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.7682042121887207 + }, + { + "auxiliary_loss_clip": 0.01451893, + "auxiliary_loss_mlp": 0.0124804, + "balance_loss_clip": 1.13786376, + "balance_loss_mlp": 1.02926755, + "epoch": 0.4523072298211333, + "flos": 28258619663520.0, + "grad_norm": 1.5455743972617528, + "language_loss": 0.61067641, + "learning_rate": 2.403230783711134e-06, + "loss": 0.63767576, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 4.4280054569244385 + }, + { + "auxiliary_loss_clip": 0.01451565, + "auxiliary_loss_mlp": 0.01247268, + "balance_loss_clip": 1.13815546, + "balance_loss_mlp": 1.02506137, + "epoch": 0.45236735307380127, + "flos": 11182584679200.0, + "grad_norm": 2.124450966256615, + "language_loss": 0.7847997, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.81178808, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 2.722989320755005 + }, + { + "auxiliary_loss_clip": 0.0145205, + "auxiliary_loss_mlp": 0.01243953, + "balance_loss_clip": 1.13818955, + "balance_loss_mlp": 1.02498937, + "epoch": 0.45242747632646924, + "flos": 22603588322400.0, + "grad_norm": 1.7599124212357453, + "language_loss": 0.63883883, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.6657989, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.7843894958496094 + }, + { + "auxiliary_loss_clip": 0.01447526, + "auxiliary_loss_mlp": 0.01246402, + "balance_loss_clip": 1.135252, + "balance_loss_mlp": 1.02801025, + "epoch": 0.45248759957913726, + "flos": 18258039820800.0, + "grad_norm": 1.8338089119698904, + "language_loss": 0.79793572, + "learning_rate": 2.402086322981083e-06, + "loss": 0.824875, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 2.738574743270874 + }, + { + "auxiliary_loss_clip": 0.01451982, + "auxiliary_loss_mlp": 0.01250663, + "balance_loss_clip": 1.14017749, + "balance_loss_mlp": 1.03150868, + "epoch": 0.4525477228318052, + "flos": 22451988009600.0, + "grad_norm": 1.7172481091837206, + "language_loss": 0.81426907, + "learning_rate": 2.40170480555747e-06, + "loss": 0.84129554, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.7567899227142334 + }, + { + "auxiliary_loss_clip": 0.01457568, + "auxiliary_loss_mlp": 0.01243353, + "balance_loss_clip": 1.1448735, + "balance_loss_mlp": 1.02343524, + "epoch": 0.4526078460844732, + "flos": 29647562726880.0, + "grad_norm": 1.6747963197721678, + "language_loss": 0.65201598, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67902517, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 2.863926410675049 + }, + { + "auxiliary_loss_clip": 0.01450398, + "auxiliary_loss_mlp": 0.01247604, + "balance_loss_clip": 1.13639116, + "balance_loss_mlp": 1.02959406, + "epoch": 0.45266796933714115, + "flos": 23042004295680.0, + "grad_norm": 3.9920503399406537, + "language_loss": 0.75367868, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.78065872, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 2.741511583328247 + }, + { + "auxiliary_loss_clip": 0.01449017, + "auxiliary_loss_mlp": 0.01251558, + "balance_loss_clip": 1.13628364, + "balance_loss_mlp": 1.03392947, + "epoch": 0.4527280925898091, + "flos": 14430139947360.0, + "grad_norm": 2.8198218219924267, + "language_loss": 0.72819519, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75520098, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 2.7447030544281006 + }, + { + "auxiliary_loss_clip": 0.0145071, + "auxiliary_loss_mlp": 0.01258333, + "balance_loss_clip": 1.13794565, + "balance_loss_mlp": 1.04070437, + "epoch": 0.4527882158424771, + "flos": 22927498087680.0, + "grad_norm": 1.9094747763548574, + "language_loss": 0.76130641, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78839684, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 4.216162919998169 + }, + { + "auxiliary_loss_clip": 0.01451669, + "auxiliary_loss_mlp": 0.01249529, + "balance_loss_clip": 1.13804007, + "balance_loss_mlp": 1.03361702, + "epoch": 0.45284833909514505, + "flos": 25557614645760.0, + "grad_norm": 1.4353348754685706, + "language_loss": 0.67182684, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69883883, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 2.80558180809021 + }, + { + "auxiliary_loss_clip": 0.01446366, + "auxiliary_loss_mlp": 0.01249804, + "balance_loss_clip": 1.13274455, + "balance_loss_mlp": 1.03389215, + "epoch": 0.452908462347813, + "flos": 18151953664320.0, + "grad_norm": 1.9840106292126358, + "language_loss": 0.78719205, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81415379, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 2.77382493019104 + }, + { + "auxiliary_loss_clip": 0.01448694, + "auxiliary_loss_mlp": 0.01254538, + "balance_loss_clip": 1.13540626, + "balance_loss_mlp": 1.03443003, + "epoch": 0.452968585600481, + "flos": 19064779799040.0, + "grad_norm": 2.119783752527777, + "language_loss": 0.83342165, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.86045396, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 2.7771787643432617 + }, + { + "auxiliary_loss_clip": 0.01452973, + "auxiliary_loss_mlp": 0.01267055, + "balance_loss_clip": 1.14045739, + "balance_loss_mlp": 1.0479008, + "epoch": 0.45302870885314894, + "flos": 22053776250240.0, + "grad_norm": 2.6809483498358238, + "language_loss": 0.76400471, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.79120493, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.7693469524383545 + }, + { + "auxiliary_loss_clip": 0.01443422, + "auxiliary_loss_mlp": 0.01238147, + "balance_loss_clip": 1.13045263, + "balance_loss_mlp": 1.0207088, + "epoch": 0.4530888321058169, + "flos": 20378320951680.0, + "grad_norm": 1.634440006947588, + "language_loss": 0.80772203, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.83453774, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.7577009201049805 + }, + { + "auxiliary_loss_clip": 0.01447985, + "auxiliary_loss_mlp": 0.01248766, + "balance_loss_clip": 1.134197, + "balance_loss_mlp": 1.02961218, + "epoch": 0.4531489553584849, + "flos": 14832220379040.0, + "grad_norm": 1.8450150994187593, + "language_loss": 0.76268959, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78965712, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.7429559230804443 + }, + { + "auxiliary_loss_clip": 0.01453635, + "auxiliary_loss_mlp": 0.01252331, + "balance_loss_clip": 1.14109588, + "balance_loss_mlp": 1.03203273, + "epoch": 0.45320907861115284, + "flos": 21947159099520.0, + "grad_norm": 2.4570072774521843, + "language_loss": 0.76431781, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.79137743, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.720088005065918 + }, + { + "auxiliary_loss_clip": 0.01519716, + "auxiliary_loss_mlp": 0.01226173, + "balance_loss_clip": 1.23376906, + "balance_loss_mlp": 1.02017975, + "epoch": 0.45326920186382086, + "flos": 66258880706880.0, + "grad_norm": 0.785789288204743, + "language_loss": 0.62286687, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.65032578, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 3.40657377243042 + }, + { + "auxiliary_loss_clip": 0.01456396, + "auxiliary_loss_mlp": 0.01248072, + "balance_loss_clip": 1.14470887, + "balance_loss_mlp": 1.0306344, + "epoch": 0.4533293251164888, + "flos": 14686005864960.0, + "grad_norm": 2.4413544819287853, + "language_loss": 0.65621704, + "learning_rate": 2.396743698142872e-06, + "loss": 0.68326175, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.7512006759643555 + }, + { + "auxiliary_loss_clip": 0.0145403, + "auxiliary_loss_mlp": 0.01260491, + "balance_loss_clip": 1.1398139, + "balance_loss_mlp": 1.03618658, + "epoch": 0.4533894483691568, + "flos": 22603398681600.0, + "grad_norm": 1.9475133571468173, + "language_loss": 0.84601444, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87315965, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 2.785184144973755 + }, + { + "auxiliary_loss_clip": 0.01448446, + "auxiliary_loss_mlp": 0.01252414, + "balance_loss_clip": 1.13503957, + "balance_loss_mlp": 1.03402293, + "epoch": 0.45344957162182475, + "flos": 34754526691200.0, + "grad_norm": 1.8057697571582918, + "language_loss": 0.77108324, + "learning_rate": 2.395980224383889e-06, + "loss": 0.79809177, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.913055896759033 + }, + { + "auxiliary_loss_clip": 0.01449067, + "auxiliary_loss_mlp": 0.01245529, + "balance_loss_clip": 1.13662815, + "balance_loss_mlp": 1.02618372, + "epoch": 0.4535096948744927, + "flos": 23552484501600.0, + "grad_norm": 1.5857059373889664, + "language_loss": 0.80559504, + "learning_rate": 2.395598464973746e-06, + "loss": 0.83254099, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 2.7530007362365723 + }, + { + "auxiliary_loss_clip": 0.0145158, + "auxiliary_loss_mlp": 0.01249773, + "balance_loss_clip": 1.13901019, + "balance_loss_mlp": 1.03080904, + "epoch": 0.4535698181271607, + "flos": 25559966191680.0, + "grad_norm": 1.694406058623594, + "language_loss": 0.76145089, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78846443, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.7819411754608154 + }, + { + "auxiliary_loss_clip": 0.01450076, + "auxiliary_loss_mlp": 0.01252002, + "balance_loss_clip": 1.13677812, + "balance_loss_mlp": 1.03017771, + "epoch": 0.45362994137982865, + "flos": 24866480792160.0, + "grad_norm": 1.8046936221258594, + "language_loss": 0.75249046, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77951121, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 2.7619495391845703 + }, + { + "auxiliary_loss_clip": 0.01454607, + "auxiliary_loss_mlp": 0.01244709, + "balance_loss_clip": 1.14247024, + "balance_loss_mlp": 1.02593613, + "epoch": 0.4536900646324966, + "flos": 30809238069600.0, + "grad_norm": 1.6545016278388425, + "language_loss": 0.72395897, + "learning_rate": 2.394453096794423e-06, + "loss": 0.75095218, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.84910249710083 + }, + { + "auxiliary_loss_clip": 0.0145928, + "auxiliary_loss_mlp": 0.01252932, + "balance_loss_clip": 1.14849424, + "balance_loss_mlp": 1.03091621, + "epoch": 0.4537501878851646, + "flos": 23406573412800.0, + "grad_norm": 2.03924444847039, + "language_loss": 0.75580442, + "learning_rate": 2.394071277466609e-06, + "loss": 0.78292656, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.7524771690368652 + }, + { + "auxiliary_loss_clip": 0.01453919, + "auxiliary_loss_mlp": 0.01250615, + "balance_loss_clip": 1.14010692, + "balance_loss_mlp": 1.02688336, + "epoch": 0.45381031113783254, + "flos": 18151195101120.0, + "grad_norm": 1.9479190105765256, + "language_loss": 0.69912982, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72617513, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.7466907501220703 + }, + { + "auxiliary_loss_clip": 0.01453591, + "auxiliary_loss_mlp": 0.01239061, + "balance_loss_clip": 1.14267659, + "balance_loss_mlp": 1.01723647, + "epoch": 0.4538704343905005, + "flos": 25338956617440.0, + "grad_norm": 2.0063701669331624, + "language_loss": 0.72429234, + "learning_rate": 2.393307593995794e-06, + "loss": 0.75121886, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.8173763751983643 + }, + { + "auxiliary_loss_clip": 0.0145384, + "auxiliary_loss_mlp": 0.01239416, + "balance_loss_clip": 1.14141655, + "balance_loss_mlp": 1.01797295, + "epoch": 0.4539305576431685, + "flos": 28734015957120.0, + "grad_norm": 1.5900450461076885, + "language_loss": 0.65498352, + "learning_rate": 2.392925729881751e-06, + "loss": 0.68191612, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 2.941467046737671 + }, + { + "auxiliary_loss_clip": 0.0146245, + "auxiliary_loss_mlp": 0.01244342, + "balance_loss_clip": 1.14975297, + "balance_loss_mlp": 1.02404368, + "epoch": 0.45399068089583644, + "flos": 22494619625760.0, + "grad_norm": 2.0117459205269785, + "language_loss": 0.68857563, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.71564353, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.8053717613220215 + }, + { + "auxiliary_loss_clip": 0.0145343, + "auxiliary_loss_mlp": 0.01241748, + "balance_loss_clip": 1.14064312, + "balance_loss_mlp": 1.01935077, + "epoch": 0.45405080414850446, + "flos": 12894906513600.0, + "grad_norm": 1.7663752648203523, + "language_loss": 0.79124469, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81819642, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.708043098449707 + }, + { + "auxiliary_loss_clip": 0.01533609, + "auxiliary_loss_mlp": 0.0120224, + "balance_loss_clip": 1.24909651, + "balance_loss_mlp": 0.9954834, + "epoch": 0.4541109274011724, + "flos": 59772494011680.0, + "grad_norm": 0.8252036529799057, + "language_loss": 0.57775301, + "learning_rate": 2.39178004819885e-06, + "loss": 0.60511148, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 3.3207218647003174 + }, + { + "auxiliary_loss_clip": 0.01456774, + "auxiliary_loss_mlp": 0.01241132, + "balance_loss_clip": 1.14459467, + "balance_loss_mlp": 1.02407539, + "epoch": 0.4541710506538404, + "flos": 28514258012160.0, + "grad_norm": 1.4881270924295489, + "language_loss": 0.76663589, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.79361498, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 2.7820887565612793 + }, + { + "auxiliary_loss_clip": 0.01462577, + "auxiliary_loss_mlp": 0.01264032, + "balance_loss_clip": 1.14957356, + "balance_loss_mlp": 1.04144478, + "epoch": 0.45423117390650836, + "flos": 17677505574720.0, + "grad_norm": 2.748606565075223, + "language_loss": 0.77020371, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79746985, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 4.533098459243774 + }, + { + "auxiliary_loss_clip": 0.01459089, + "auxiliary_loss_mlp": 0.01245582, + "balance_loss_clip": 1.14577329, + "balance_loss_mlp": 1.02604604, + "epoch": 0.4542912971591763, + "flos": 28074969691200.0, + "grad_norm": 1.3492053636809305, + "language_loss": 0.72572482, + "learning_rate": 2.390634232808903e-06, + "loss": 0.75277156, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.8728692531585693 + }, + { + "auxiliary_loss_clip": 0.01464725, + "auxiliary_loss_mlp": 0.01258286, + "balance_loss_clip": 1.15128112, + "balance_loss_mlp": 1.03302836, + "epoch": 0.4543514204118443, + "flos": 22673832003360.0, + "grad_norm": 2.269252615971606, + "language_loss": 0.63217288, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65940297, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 2.746307134628296 + }, + { + "auxiliary_loss_clip": 0.01534412, + "auxiliary_loss_mlp": 0.01199753, + "balance_loss_clip": 1.25005507, + "balance_loss_mlp": 0.99375916, + "epoch": 0.45441154366451225, + "flos": 58223075081760.0, + "grad_norm": 0.699095522510566, + "language_loss": 0.57511365, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.60245526, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 3.2340641021728516 + }, + { + "auxiliary_loss_clip": 0.01463701, + "auxiliary_loss_mlp": 0.01250214, + "balance_loss_clip": 1.15037155, + "balance_loss_mlp": 1.02762651, + "epoch": 0.4544716669171802, + "flos": 16766196566400.0, + "grad_norm": 4.843271340446579, + "language_loss": 0.56855822, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.5956974, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.7499372959136963 + }, + { + "auxiliary_loss_clip": 0.01462448, + "auxiliary_loss_mlp": 0.01251091, + "balance_loss_clip": 1.14968967, + "balance_loss_mlp": 1.03021979, + "epoch": 0.4545317901698482, + "flos": 15927103503360.0, + "grad_norm": 2.1071311124800016, + "language_loss": 0.72603667, + "learning_rate": 2.389106271642792e-06, + "loss": 0.75317204, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.7285196781158447 + }, + { + "auxiliary_loss_clip": 0.01467275, + "auxiliary_loss_mlp": 0.01249701, + "balance_loss_clip": 1.15381145, + "balance_loss_mlp": 1.02844882, + "epoch": 0.45459191342251615, + "flos": 17641701027360.0, + "grad_norm": 2.333778944983639, + "language_loss": 0.69587636, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.72304606, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 5.804704904556274 + }, + { + "auxiliary_loss_clip": 0.01462246, + "auxiliary_loss_mlp": 0.01248885, + "balance_loss_clip": 1.14862645, + "balance_loss_mlp": 1.03354502, + "epoch": 0.4546520366751841, + "flos": 16178266329120.0, + "grad_norm": 1.7291621484756583, + "language_loss": 0.85527027, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.88238156, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 2.764768362045288 + }, + { + "auxiliary_loss_clip": 0.01468523, + "auxiliary_loss_mlp": 0.01251367, + "balance_loss_clip": 1.15718019, + "balance_loss_mlp": 1.03125954, + "epoch": 0.4547121599278521, + "flos": 19753258681440.0, + "grad_norm": 1.889026370049938, + "language_loss": 0.89896476, + "learning_rate": 2.38796014579055e-06, + "loss": 0.92616367, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.7786247730255127 + }, + { + "auxiliary_loss_clip": 0.01464912, + "auxiliary_loss_mlp": 0.0126381, + "balance_loss_clip": 1.15360057, + "balance_loss_mlp": 1.04503703, + "epoch": 0.45477228318052004, + "flos": 19939487768640.0, + "grad_norm": 2.931545196809425, + "language_loss": 0.72117329, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.74846047, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 2.7619259357452393 + }, + { + "auxiliary_loss_clip": 0.01467815, + "auxiliary_loss_mlp": 0.01260663, + "balance_loss_clip": 1.15590489, + "balance_loss_mlp": 1.0397923, + "epoch": 0.454832406433188, + "flos": 21290464379520.0, + "grad_norm": 2.1019697551760355, + "language_loss": 0.68425286, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.7115376, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.8672261238098145 + }, + { + "auxiliary_loss_clip": 0.01458698, + "auxiliary_loss_mlp": 0.01251942, + "balance_loss_clip": 1.1470108, + "balance_loss_mlp": 1.03583956, + "epoch": 0.45489252968585603, + "flos": 24501039327360.0, + "grad_norm": 1.7904137859340012, + "language_loss": 0.80655706, + "learning_rate": 2.386813887534922e-06, + "loss": 0.83366346, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 2.8013570308685303 + }, + { + "auxiliary_loss_clip": 0.01461009, + "auxiliary_loss_mlp": 0.01245858, + "balance_loss_clip": 1.14770103, + "balance_loss_mlp": 1.0267036, + "epoch": 0.454952652938524, + "flos": 17094392213760.0, + "grad_norm": 1.608453472365314, + "language_loss": 0.74086463, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.76793331, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 2.7499542236328125 + }, + { + "auxiliary_loss_clip": 0.01467186, + "auxiliary_loss_mlp": 0.01255648, + "balance_loss_clip": 1.15528798, + "balance_loss_mlp": 1.03744698, + "epoch": 0.45501277619119196, + "flos": 27632533332960.0, + "grad_norm": 1.4721446629025616, + "language_loss": 0.81162208, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83885044, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 2.7976818084716797 + }, + { + "auxiliary_loss_clip": 0.01468082, + "auxiliary_loss_mlp": 0.01264517, + "balance_loss_clip": 1.15351391, + "balance_loss_mlp": 1.03849649, + "epoch": 0.4550728994438599, + "flos": 19976733586080.0, + "grad_norm": 2.2237784654912454, + "language_loss": 0.79668772, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.82401371, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 4.356321096420288 + }, + { + "auxiliary_loss_clip": 0.01468593, + "auxiliary_loss_mlp": 0.01261349, + "balance_loss_clip": 1.15384793, + "balance_loss_mlp": 1.04009628, + "epoch": 0.4551330226965279, + "flos": 26069232696480.0, + "grad_norm": 1.4426691670315086, + "language_loss": 0.75052476, + "learning_rate": 2.385285337909412e-06, + "loss": 0.7778241, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 2.846862316131592 + }, + { + "auxiliary_loss_clip": 0.01469303, + "auxiliary_loss_mlp": 0.01255659, + "balance_loss_clip": 1.15669918, + "balance_loss_mlp": 1.03783989, + "epoch": 0.45519314594919585, + "flos": 32783873608800.0, + "grad_norm": 1.6783293995159407, + "language_loss": 0.74891615, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.77616578, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 2.8289296627044678 + }, + { + "auxiliary_loss_clip": 0.01468032, + "auxiliary_loss_mlp": 0.01243222, + "balance_loss_clip": 1.15452051, + "balance_loss_mlp": 1.02654719, + "epoch": 0.4552532692018638, + "flos": 19174734627840.0, + "grad_norm": 1.6318426633108354, + "language_loss": 0.81078923, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83790183, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.7989673614501953 + }, + { + "auxiliary_loss_clip": 0.01466754, + "auxiliary_loss_mlp": 0.01258184, + "balance_loss_clip": 1.15296817, + "balance_loss_mlp": 1.03635907, + "epoch": 0.4553133924545318, + "flos": 26029331907840.0, + "grad_norm": 1.9868475340776095, + "language_loss": 0.72499359, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75224298, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 2.7809712886810303 + }, + { + "auxiliary_loss_clip": 0.01465443, + "auxiliary_loss_mlp": 0.01254536, + "balance_loss_clip": 1.15131724, + "balance_loss_mlp": 1.03500056, + "epoch": 0.45537351570719975, + "flos": 30664009687680.0, + "grad_norm": 2.0349197075393093, + "language_loss": 0.742486, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76968575, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.806004762649536 + }, + { + "auxiliary_loss_clip": 0.01466976, + "auxiliary_loss_mlp": 0.01249317, + "balance_loss_clip": 1.15318656, + "balance_loss_mlp": 1.02959025, + "epoch": 0.4554336389598677, + "flos": 24355735089120.0, + "grad_norm": 1.545188017989516, + "language_loss": 0.71199256, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73915547, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.8034491539001465 + }, + { + "auxiliary_loss_clip": 0.01458062, + "auxiliary_loss_mlp": 0.0125797, + "balance_loss_clip": 1.14588428, + "balance_loss_mlp": 1.03881609, + "epoch": 0.4554937622125357, + "flos": 20559998659680.0, + "grad_norm": 2.0360580469261924, + "language_loss": 0.73051775, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75767809, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.7660555839538574 + }, + { + "auxiliary_loss_clip": 0.01465643, + "auxiliary_loss_mlp": 0.01250949, + "balance_loss_clip": 1.15120065, + "balance_loss_mlp": 1.03236699, + "epoch": 0.45555388546520365, + "flos": 22823004913920.0, + "grad_norm": 2.339677138766853, + "language_loss": 0.66439265, + "learning_rate": 2.382609814135511e-06, + "loss": 0.6915586, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.8736956119537354 + }, + { + "auxiliary_loss_clip": 0.01469822, + "auxiliary_loss_mlp": 0.01251709, + "balance_loss_clip": 1.15393376, + "balance_loss_mlp": 1.03160143, + "epoch": 0.4556140087178716, + "flos": 21728159717760.0, + "grad_norm": 1.7386092242696214, + "language_loss": 0.73741013, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76462543, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 2.7426133155822754 + }, + { + "auxiliary_loss_clip": 0.01473952, + "auxiliary_loss_mlp": 0.01259204, + "balance_loss_clip": 1.16118658, + "balance_loss_mlp": 1.04005015, + "epoch": 0.45567413197053963, + "flos": 25996902966720.0, + "grad_norm": 1.9410615144237866, + "language_loss": 0.70170224, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72903377, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 2.800776243209839 + }, + { + "auxiliary_loss_clip": 0.01461421, + "auxiliary_loss_mlp": 0.01243386, + "balance_loss_clip": 1.14571548, + "balance_loss_mlp": 1.02575767, + "epoch": 0.4557342552232076, + "flos": 21539427372000.0, + "grad_norm": 1.7063328608855224, + "language_loss": 0.78579521, + "learning_rate": 2.381462943170627e-06, + "loss": 0.81284332, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.7645559310913086 + }, + { + "auxiliary_loss_clip": 0.01469648, + "auxiliary_loss_mlp": 0.01239224, + "balance_loss_clip": 1.15467453, + "balance_loss_mlp": 1.01453853, + "epoch": 0.45579437847587556, + "flos": 40004974342080.0, + "grad_norm": 1.973026856489346, + "language_loss": 0.6855275, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71261626, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 2.996273994445801 + }, + { + "auxiliary_loss_clip": 0.0146222, + "auxiliary_loss_mlp": 0.01251235, + "balance_loss_clip": 1.14662981, + "balance_loss_mlp": 1.03341556, + "epoch": 0.4558545017285435, + "flos": 31141150676640.0, + "grad_norm": 2.013585078776668, + "language_loss": 0.72820383, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75533843, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 2.966859817504883 + }, + { + "auxiliary_loss_clip": 0.01461198, + "auxiliary_loss_mlp": 0.01254575, + "balance_loss_clip": 1.14525723, + "balance_loss_mlp": 1.03103375, + "epoch": 0.4559146249812115, + "flos": 21727932148800.0, + "grad_norm": 1.7490563101006087, + "language_loss": 0.72651303, + "learning_rate": 2.380315942019729e-06, + "loss": 0.75367075, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.878666400909424 + }, + { + "auxiliary_loss_clip": 0.01462195, + "auxiliary_loss_mlp": 0.01245401, + "balance_loss_clip": 1.14609265, + "balance_loss_mlp": 1.0189991, + "epoch": 0.45597474823387946, + "flos": 23808274562880.0, + "grad_norm": 1.8191367663609406, + "language_loss": 0.72419268, + "learning_rate": 2.379933579440195e-06, + "loss": 0.75126863, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 2.8204827308654785 + }, + { + "auxiliary_loss_clip": 0.01468661, + "auxiliary_loss_mlp": 0.01245414, + "balance_loss_clip": 1.15506637, + "balance_loss_mlp": 1.02587819, + "epoch": 0.4560348714865474, + "flos": 31908407076000.0, + "grad_norm": 2.974585596213567, + "language_loss": 0.68452376, + "learning_rate": 2.379551202453541e-06, + "loss": 0.71166456, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 2.839479684829712 + }, + { + "auxiliary_loss_clip": 0.01460164, + "auxiliary_loss_mlp": 0.01253312, + "balance_loss_clip": 1.14647233, + "balance_loss_mlp": 1.03301311, + "epoch": 0.4560949947392154, + "flos": 22050704069280.0, + "grad_norm": 1.709916372917894, + "language_loss": 0.76059091, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78772569, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.776766061782837 + }, + { + "auxiliary_loss_clip": 0.01468809, + "auxiliary_loss_mlp": 0.0124938, + "balance_loss_clip": 1.15425634, + "balance_loss_mlp": 1.02908111, + "epoch": 0.45615511799188335, + "flos": 24574127620320.0, + "grad_norm": 1.747981047983129, + "language_loss": 0.78170061, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80888253, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.761751413345337 + }, + { + "auxiliary_loss_clip": 0.01463101, + "auxiliary_loss_mlp": 0.01246817, + "balance_loss_clip": 1.14811528, + "balance_loss_mlp": 1.02613676, + "epoch": 0.4562152412445513, + "flos": 18332569383840.0, + "grad_norm": 1.8593364472307583, + "language_loss": 0.69297469, + "learning_rate": 2.378403985195863e-06, + "loss": 0.72007394, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.793295383453369 + }, + { + "auxiliary_loss_clip": 0.01462094, + "auxiliary_loss_mlp": 0.01238698, + "balance_loss_clip": 1.1473031, + "balance_loss_mlp": 1.02011609, + "epoch": 0.4562753644972193, + "flos": 13518224088480.0, + "grad_norm": 2.262905446920968, + "language_loss": 0.79303503, + "learning_rate": 2.378021550725735e-06, + "loss": 0.82004297, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.764014482498169 + }, + { + "auxiliary_loss_clip": 0.01467186, + "auxiliary_loss_mlp": 0.01245279, + "balance_loss_clip": 1.15291882, + "balance_loss_mlp": 1.02231026, + "epoch": 0.45633548774988725, + "flos": 29642101071840.0, + "grad_norm": 2.3613088993368474, + "language_loss": 0.62031996, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64744461, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 2.8096866607666016 + }, + { + "auxiliary_loss_clip": 0.01466362, + "auxiliary_loss_mlp": 0.01247155, + "balance_loss_clip": 1.15192914, + "balance_loss_mlp": 1.02685666, + "epoch": 0.4563956110025552, + "flos": 22235416030080.0, + "grad_norm": 1.8235921519650324, + "language_loss": 0.72815657, + "learning_rate": 2.377256638796135e-06, + "loss": 0.7552917, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 2.7960658073425293 + }, + { + "auxiliary_loss_clip": 0.01468467, + "auxiliary_loss_mlp": 0.01245189, + "balance_loss_clip": 1.15399683, + "balance_loss_mlp": 1.02145743, + "epoch": 0.45645573425522323, + "flos": 17093937075840.0, + "grad_norm": 2.1767474232033828, + "language_loss": 0.76635194, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.7934885, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 2.936685562133789 + }, + { + "auxiliary_loss_clip": 0.01459385, + "auxiliary_loss_mlp": 0.01240835, + "balance_loss_clip": 1.14508975, + "balance_loss_mlp": 1.01958275, + "epoch": 0.4565158575078912, + "flos": 20334020496480.0, + "grad_norm": 2.0241439205914653, + "language_loss": 0.6976853, + "learning_rate": 2.376491669644098e-06, + "loss": 0.72468746, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 4.379237651824951 + }, + { + "auxiliary_loss_clip": 0.01458357, + "auxiliary_loss_mlp": 0.01234342, + "balance_loss_clip": 1.14483345, + "balance_loss_mlp": 1.01442456, + "epoch": 0.45657598076055916, + "flos": 23985021610080.0, + "grad_norm": 1.909580498160902, + "language_loss": 0.83883232, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86575931, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.8241193294525146 + }, + { + "auxiliary_loss_clip": 0.01549692, + "auxiliary_loss_mlp": 0.0127037, + "balance_loss_clip": 1.26302087, + "balance_loss_mlp": 1.06819153, + "epoch": 0.45663610401322713, + "flos": 69370500356640.0, + "grad_norm": 0.8235307771946617, + "language_loss": 0.5276472, + "learning_rate": 2.375726643385654e-06, + "loss": 0.55584782, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 3.379601001739502 + }, + { + "auxiliary_loss_clip": 0.01456766, + "auxiliary_loss_mlp": 0.01240253, + "balance_loss_clip": 1.1421237, + "balance_loss_mlp": 1.01938176, + "epoch": 0.4566962272658951, + "flos": 15150365064000.0, + "grad_norm": 2.323752154664937, + "language_loss": 0.87716758, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.90413779, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 2.8312411308288574 + }, + { + "auxiliary_loss_clip": 0.01462663, + "auxiliary_loss_mlp": 0.0124542, + "balance_loss_clip": 1.14886713, + "balance_loss_mlp": 1.02397656, + "epoch": 0.45675635051856306, + "flos": 18699452118720.0, + "grad_norm": 1.6148638685992385, + "language_loss": 0.772394, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79947484, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.896040916442871 + }, + { + "auxiliary_loss_clip": 0.01461163, + "auxiliary_loss_mlp": 0.01244437, + "balance_loss_clip": 1.14643943, + "balance_loss_mlp": 1.02356601, + "epoch": 0.456816473771231, + "flos": 19100280921120.0, + "grad_norm": 1.6877948980951016, + "language_loss": 0.78616226, + "learning_rate": 2.374578997177314e-06, + "loss": 0.81321824, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.7974233627319336 + }, + { + "auxiliary_loss_clip": 0.01453986, + "auxiliary_loss_mlp": 0.01250887, + "balance_loss_clip": 1.13929176, + "balance_loss_mlp": 1.03287697, + "epoch": 0.456876597023899, + "flos": 28952636057280.0, + "grad_norm": 3.116057891083025, + "language_loss": 0.71232116, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73936993, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 6.008702039718628 + }, + { + "auxiliary_loss_clip": 0.0145448, + "auxiliary_loss_mlp": 0.01242179, + "balance_loss_clip": 1.1409812, + "balance_loss_mlp": 1.02359748, + "epoch": 0.45693672027656695, + "flos": 23291422426080.0, + "grad_norm": 1.9073452311856598, + "language_loss": 0.69724107, + "learning_rate": 2.373813828660544e-06, + "loss": 0.72420764, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.785315752029419 + }, + { + "auxiliary_loss_clip": 0.01459342, + "auxiliary_loss_mlp": 0.01247801, + "balance_loss_clip": 1.1465143, + "balance_loss_mlp": 1.02597666, + "epoch": 0.4569968435292349, + "flos": 20560302084960.0, + "grad_norm": 1.9313141155394216, + "language_loss": 0.79069066, + "learning_rate": 2.373431223132319e-06, + "loss": 0.81776208, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 2.899796724319458 + }, + { + "auxiliary_loss_clip": 0.0145857, + "auxiliary_loss_mlp": 0.01244624, + "balance_loss_clip": 1.14491713, + "balance_loss_mlp": 1.02279925, + "epoch": 0.4570569667819029, + "flos": 41285896912800.0, + "grad_norm": 1.7634588952593453, + "language_loss": 0.71552771, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.74255967, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 2.9258954524993896 + }, + { + "auxiliary_loss_clip": 0.01458025, + "auxiliary_loss_mlp": 0.01245653, + "balance_loss_clip": 1.14525068, + "balance_loss_mlp": 1.02459085, + "epoch": 0.45711709003457085, + "flos": 26033997071520.0, + "grad_norm": 1.8445338045128168, + "language_loss": 0.73159498, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75863171, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 2.775020122528076 + }, + { + "auxiliary_loss_clip": 0.01455542, + "auxiliary_loss_mlp": 0.0125009, + "balance_loss_clip": 1.14229274, + "balance_loss_mlp": 1.03074527, + "epoch": 0.4571772132872388, + "flos": 22159938263040.0, + "grad_norm": 2.2696577034170953, + "language_loss": 0.83090138, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85795772, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 2.776607036590576 + }, + { + "auxiliary_loss_clip": 0.01462391, + "auxiliary_loss_mlp": 0.01248904, + "balance_loss_clip": 1.14888215, + "balance_loss_mlp": 1.02765131, + "epoch": 0.45723733653990684, + "flos": 23881173215040.0, + "grad_norm": 1.7139127206940312, + "language_loss": 0.86244112, + "learning_rate": 2.371900659559016e-06, + "loss": 0.88955408, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 2.785215139389038 + }, + { + "auxiliary_loss_clip": 0.01456718, + "auxiliary_loss_mlp": 0.01248422, + "balance_loss_clip": 1.14361167, + "balance_loss_mlp": 1.02697861, + "epoch": 0.4572974597925748, + "flos": 16873610208480.0, + "grad_norm": 1.7259632637313083, + "language_loss": 0.73642373, + "learning_rate": 2.371517983373138e-06, + "loss": 0.76347512, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 2.711724281311035 + }, + { + "auxiliary_loss_clip": 0.01464813, + "auxiliary_loss_mlp": 0.01248709, + "balance_loss_clip": 1.15242076, + "balance_loss_mlp": 1.02669382, + "epoch": 0.45735758304524277, + "flos": 13773179730240.0, + "grad_norm": 2.7149150593534483, + "language_loss": 0.80551016, + "learning_rate": 2.371135293099262e-06, + "loss": 0.83264536, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.731659412384033 + }, + { + "auxiliary_loss_clip": 0.01462059, + "auxiliary_loss_mlp": 0.01254042, + "balance_loss_clip": 1.14931726, + "balance_loss_mlp": 1.03259921, + "epoch": 0.45741770629791073, + "flos": 21102604381440.0, + "grad_norm": 2.0078228940923553, + "language_loss": 0.80805337, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83521438, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 4.170407056808472 + }, + { + "auxiliary_loss_clip": 0.01459918, + "auxiliary_loss_mlp": 0.0125378, + "balance_loss_clip": 1.14571261, + "balance_loss_mlp": 1.03443503, + "epoch": 0.4574778295505787, + "flos": 23115168444960.0, + "grad_norm": 1.7327693023444013, + "language_loss": 0.68773127, + "learning_rate": 2.370369870345559e-06, + "loss": 0.71486831, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 2.7975010871887207 + }, + { + "auxiliary_loss_clip": 0.01457929, + "auxiliary_loss_mlp": 0.01244239, + "balance_loss_clip": 1.1448884, + "balance_loss_mlp": 1.02489352, + "epoch": 0.45753795280324666, + "flos": 24355242023040.0, + "grad_norm": 2.040611895111116, + "language_loss": 0.80526078, + "learning_rate": 2.369987137894757e-06, + "loss": 0.83228242, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 2.7996268272399902 + }, + { + "auxiliary_loss_clip": 0.01456337, + "auxiliary_loss_mlp": 0.01259906, + "balance_loss_clip": 1.14109397, + "balance_loss_mlp": 1.03998828, + "epoch": 0.4575980760559146, + "flos": 16655634887040.0, + "grad_norm": 2.4647541963780277, + "language_loss": 0.82660425, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.85376668, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.731144428253174 + }, + { + "auxiliary_loss_clip": 0.0146389, + "auxiliary_loss_mlp": 0.01260698, + "balance_loss_clip": 1.1483736, + "balance_loss_mlp": 1.04020882, + "epoch": 0.4576581993085826, + "flos": 35913964272480.0, + "grad_norm": 2.579902465455726, + "language_loss": 0.73396838, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76121426, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 2.981215000152588 + }, + { + "auxiliary_loss_clip": 0.01456467, + "auxiliary_loss_mlp": 0.01250828, + "balance_loss_clip": 1.14215684, + "balance_loss_mlp": 1.03453493, + "epoch": 0.45771832256125056, + "flos": 20082288748320.0, + "grad_norm": 1.7088273941700007, + "language_loss": 0.84804744, + "learning_rate": 2.368838856420711e-06, + "loss": 0.8751204, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.758359909057617 + }, + { + "auxiliary_loss_clip": 0.01457849, + "auxiliary_loss_mlp": 0.01257207, + "balance_loss_clip": 1.14549088, + "balance_loss_mlp": 1.03786182, + "epoch": 0.4577784458139185, + "flos": 10745913401280.0, + "grad_norm": 1.971898431621837, + "language_loss": 0.75618553, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.7833361, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.815089464187622 + }, + { + "auxiliary_loss_clip": 0.01457859, + "auxiliary_loss_mlp": 0.01255216, + "balance_loss_clip": 1.14435565, + "balance_loss_mlp": 1.03987575, + "epoch": 0.4578385690665865, + "flos": 21909382287840.0, + "grad_norm": 1.4304274771438255, + "language_loss": 0.74554777, + "learning_rate": 2.368073265481791e-06, + "loss": 0.77267849, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.765101194381714 + }, + { + "auxiliary_loss_clip": 0.01526643, + "auxiliary_loss_mlp": 0.01237625, + "balance_loss_clip": 1.23983812, + "balance_loss_mlp": 1.03392029, + "epoch": 0.45789869231925445, + "flos": 64763738066880.0, + "grad_norm": 0.7767302511804622, + "language_loss": 0.57572329, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.60336602, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.255859375 + }, + { + "auxiliary_loss_clip": 0.0145686, + "auxiliary_loss_mlp": 0.01258503, + "balance_loss_clip": 1.14228106, + "balance_loss_mlp": 1.03973055, + "epoch": 0.4579588155719224, + "flos": 16145875316160.0, + "grad_norm": 1.9083416505376118, + "language_loss": 0.71213782, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73929143, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.8764216899871826 + }, + { + "auxiliary_loss_clip": 0.01467481, + "auxiliary_loss_mlp": 0.01253567, + "balance_loss_clip": 1.15503192, + "balance_loss_mlp": 1.03326845, + "epoch": 0.45801893882459044, + "flos": 21397574596320.0, + "grad_norm": 1.9903199282431951, + "language_loss": 0.76668334, + "learning_rate": 2.36692477442939e-06, + "loss": 0.79389387, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 2.742586135864258 + }, + { + "auxiliary_loss_clip": 0.01460379, + "auxiliary_loss_mlp": 0.01261276, + "balance_loss_clip": 1.14637947, + "balance_loss_mlp": 1.04250288, + "epoch": 0.4580790620772584, + "flos": 19538583109920.0, + "grad_norm": 1.641376889417013, + "language_loss": 0.76839447, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79561102, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.8108503818511963 + }, + { + "auxiliary_loss_clip": 0.01460986, + "auxiliary_loss_mlp": 0.01246232, + "balance_loss_clip": 1.1483444, + "balance_loss_mlp": 1.02936637, + "epoch": 0.45813918532992637, + "flos": 16582925875680.0, + "grad_norm": 1.9200935441546951, + "language_loss": 0.71871352, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74578571, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 2.723343849182129 + }, + { + "auxiliary_loss_clip": 0.01458717, + "auxiliary_loss_mlp": 0.01250431, + "balance_loss_clip": 1.14605665, + "balance_loss_mlp": 1.03490067, + "epoch": 0.45819930858259433, + "flos": 42233579390880.0, + "grad_norm": 1.7357873209349335, + "language_loss": 0.78419435, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.81128585, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.9277186393737793 + }, + { + "auxiliary_loss_clip": 0.01516106, + "auxiliary_loss_mlp": 0.01206276, + "balance_loss_clip": 1.22915244, + "balance_loss_mlp": 1.00104523, + "epoch": 0.4582594318352623, + "flos": 63721006526880.0, + "grad_norm": 0.7921706165730013, + "language_loss": 0.64849186, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67571568, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 3.371899366378784 + }, + { + "auxiliary_loss_clip": 0.01458727, + "auxiliary_loss_mlp": 0.01247614, + "balance_loss_clip": 1.14511299, + "balance_loss_mlp": 1.02960348, + "epoch": 0.45831955508793026, + "flos": 26872407427680.0, + "grad_norm": 1.9438247342015875, + "language_loss": 0.79770344, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.82476687, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.820124626159668 + }, + { + "auxiliary_loss_clip": 0.01456222, + "auxiliary_loss_mlp": 0.01261311, + "balance_loss_clip": 1.14251041, + "balance_loss_mlp": 1.04101181, + "epoch": 0.45837967834059823, + "flos": 18735939372960.0, + "grad_norm": 2.18867872411962, + "language_loss": 0.70681506, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.73399037, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 2.827639579772949 + }, + { + "auxiliary_loss_clip": 0.01454293, + "auxiliary_loss_mlp": 0.01259843, + "balance_loss_clip": 1.14011598, + "balance_loss_mlp": 1.04049802, + "epoch": 0.4584398015932662, + "flos": 21180206125440.0, + "grad_norm": 2.776441664917115, + "language_loss": 0.73272479, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75986612, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.7477381229400635 + }, + { + "auxiliary_loss_clip": 0.01459344, + "auxiliary_loss_mlp": 0.01255576, + "balance_loss_clip": 1.14562714, + "balance_loss_mlp": 1.03813791, + "epoch": 0.45849992484593416, + "flos": 19791983697120.0, + "grad_norm": 2.6291391091793286, + "language_loss": 0.78234994, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80949914, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.8265061378479004 + }, + { + "auxiliary_loss_clip": 0.01454136, + "auxiliary_loss_mlp": 0.01247641, + "balance_loss_clip": 1.14141297, + "balance_loss_mlp": 1.02867723, + "epoch": 0.4585600480986021, + "flos": 18225231598080.0, + "grad_norm": 1.5273463222507178, + "language_loss": 0.84896541, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87598312, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.735961437225342 + }, + { + "auxiliary_loss_clip": 0.01457574, + "auxiliary_loss_mlp": 0.01252829, + "balance_loss_clip": 1.14263475, + "balance_loss_mlp": 1.03233957, + "epoch": 0.4586201713512701, + "flos": 29024738218080.0, + "grad_norm": 1.7166232463187563, + "language_loss": 0.69477797, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.7218821, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.82955002784729 + }, + { + "auxiliary_loss_clip": 0.01452285, + "auxiliary_loss_mlp": 0.01261457, + "balance_loss_clip": 1.13971174, + "balance_loss_mlp": 1.04726183, + "epoch": 0.45868029460393805, + "flos": 23406876838080.0, + "grad_norm": 1.56952794185792, + "language_loss": 0.78276551, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80990297, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.763808012008667 + }, + { + "auxiliary_loss_clip": 0.01460213, + "auxiliary_loss_mlp": 0.01254035, + "balance_loss_clip": 1.14712083, + "balance_loss_mlp": 1.03411746, + "epoch": 0.458740417856606, + "flos": 18223866184320.0, + "grad_norm": 2.14740350249135, + "language_loss": 0.7958681, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.82301056, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 4.477554798126221 + }, + { + "auxiliary_loss_clip": 0.01455867, + "auxiliary_loss_mlp": 0.01254021, + "balance_loss_clip": 1.1416868, + "balance_loss_mlp": 1.03467643, + "epoch": 0.458800541109274, + "flos": 34571635282080.0, + "grad_norm": 1.6919308028708095, + "language_loss": 0.72140098, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74849987, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.9404361248016357 + }, + { + "auxiliary_loss_clip": 0.01455458, + "auxiliary_loss_mlp": 0.0126299, + "balance_loss_clip": 1.14210486, + "balance_loss_mlp": 1.04326367, + "epoch": 0.458860664361942, + "flos": 17714030757120.0, + "grad_norm": 2.042227112680629, + "language_loss": 0.71932471, + "learning_rate": 2.361563500108531e-06, + "loss": 0.74650919, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.7925100326538086 + }, + { + "auxiliary_loss_clip": 0.01453519, + "auxiliary_loss_mlp": 0.01248477, + "balance_loss_clip": 1.13917828, + "balance_loss_mlp": 1.0262711, + "epoch": 0.45892078761460997, + "flos": 18443851698240.0, + "grad_norm": 3.2859051563646724, + "language_loss": 0.69635016, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.72337013, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 2.802891254425049 + }, + { + "auxiliary_loss_clip": 0.01454713, + "auxiliary_loss_mlp": 0.01255202, + "balance_loss_clip": 1.1420691, + "balance_loss_mlp": 1.03585708, + "epoch": 0.45898091086727794, + "flos": 22675083632640.0, + "grad_norm": 1.5436609900484826, + "language_loss": 0.80820775, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.83530688, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 2.7914483547210693 + }, + { + "auxiliary_loss_clip": 0.01458579, + "auxiliary_loss_mlp": 0.01259567, + "balance_loss_clip": 1.14549708, + "balance_loss_mlp": 1.03831482, + "epoch": 0.4590410341199459, + "flos": 21655109352960.0, + "grad_norm": 1.6457059675951242, + "language_loss": 0.81955332, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.84673482, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 2.7862043380737305 + }, + { + "auxiliary_loss_clip": 0.01456302, + "auxiliary_loss_mlp": 0.01245157, + "balance_loss_clip": 1.14318156, + "balance_loss_mlp": 1.02543032, + "epoch": 0.45910115737261387, + "flos": 36538154195040.0, + "grad_norm": 1.4632650226631454, + "language_loss": 0.64600778, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67302233, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 2.902723789215088 + }, + { + "auxiliary_loss_clip": 0.01456729, + "auxiliary_loss_mlp": 0.0124318, + "balance_loss_clip": 1.14309824, + "balance_loss_mlp": 1.02364469, + "epoch": 0.45916128062528183, + "flos": 24421844600640.0, + "grad_norm": 1.6775480407718313, + "language_loss": 0.8061139, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.83311301, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 4.361846923828125 + }, + { + "auxiliary_loss_clip": 0.01451325, + "auxiliary_loss_mlp": 0.01253779, + "balance_loss_clip": 1.13776159, + "balance_loss_mlp": 1.03214526, + "epoch": 0.4592214038779498, + "flos": 23224212997920.0, + "grad_norm": 1.4413740391238519, + "language_loss": 0.75480103, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.78185201, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 2.7477872371673584 + }, + { + "auxiliary_loss_clip": 0.01456707, + "auxiliary_loss_mlp": 0.01247614, + "balance_loss_clip": 1.14356112, + "balance_loss_mlp": 1.02731562, + "epoch": 0.45928152713061776, + "flos": 19174165705440.0, + "grad_norm": 1.8470223562528398, + "language_loss": 0.73749936, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76454258, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 2.7541797161102295 + }, + { + "auxiliary_loss_clip": 0.01453196, + "auxiliary_loss_mlp": 0.01251418, + "balance_loss_clip": 1.14058316, + "balance_loss_mlp": 1.02825856, + "epoch": 0.4593416503832857, + "flos": 22416638600160.0, + "grad_norm": 1.6393257765360236, + "language_loss": 0.68116403, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70821017, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 2.73185133934021 + }, + { + "auxiliary_loss_clip": 0.01450963, + "auxiliary_loss_mlp": 0.01255162, + "balance_loss_clip": 1.13777518, + "balance_loss_mlp": 1.03543508, + "epoch": 0.4594017736359537, + "flos": 18882229743360.0, + "grad_norm": 1.650469437784049, + "language_loss": 0.75659502, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.78365624, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 2.778110980987549 + }, + { + "auxiliary_loss_clip": 0.01453946, + "auxiliary_loss_mlp": 0.01240472, + "balance_loss_clip": 1.14167893, + "balance_loss_mlp": 1.01826596, + "epoch": 0.45946189688862166, + "flos": 20520477152640.0, + "grad_norm": 1.7595704904168155, + "language_loss": 0.74656463, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77350885, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 2.7140157222747803 + }, + { + "auxiliary_loss_clip": 0.01500658, + "auxiliary_loss_mlp": 0.01208572, + "balance_loss_clip": 1.21052527, + "balance_loss_mlp": 1.00257874, + "epoch": 0.4595220201412896, + "flos": 61411499984160.0, + "grad_norm": 0.8554394509657904, + "language_loss": 0.58187985, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60897207, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 3.0945262908935547 + }, + { + "auxiliary_loss_clip": 0.01444551, + "auxiliary_loss_mlp": 0.01258652, + "balance_loss_clip": 1.13118732, + "balance_loss_mlp": 1.03949821, + "epoch": 0.4595821433939576, + "flos": 23333181694560.0, + "grad_norm": 1.5263237577541435, + "language_loss": 0.93086529, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95789742, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 2.8479275703430176 + }, + { + "auxiliary_loss_clip": 0.01449654, + "auxiliary_loss_mlp": 0.01249546, + "balance_loss_clip": 1.1362009, + "balance_loss_mlp": 1.02505112, + "epoch": 0.4596422666466256, + "flos": 14284077145920.0, + "grad_norm": 2.007340328135902, + "language_loss": 0.82772958, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.85472155, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 2.834528923034668 + }, + { + "auxiliary_loss_clip": 0.01500799, + "auxiliary_loss_mlp": 0.01215729, + "balance_loss_clip": 1.21234095, + "balance_loss_mlp": 1.00973511, + "epoch": 0.4597023898992936, + "flos": 65733987800160.0, + "grad_norm": 0.7750489222840455, + "language_loss": 0.59818935, + "learning_rate": 2.356199538526593e-06, + "loss": 0.62535465, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 4.650912046432495 + }, + { + "auxiliary_loss_clip": 0.01445808, + "auxiliary_loss_mlp": 0.01240673, + "balance_loss_clip": 1.13151801, + "balance_loss_mlp": 1.02247238, + "epoch": 0.45976251315196154, + "flos": 26909539460640.0, + "grad_norm": 1.8438345662273723, + "language_loss": 0.72582662, + "learning_rate": 2.355816296637939e-06, + "loss": 0.75269151, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.816174268722534 + }, + { + "auxiliary_loss_clip": 0.01446593, + "auxiliary_loss_mlp": 0.01257415, + "balance_loss_clip": 1.1345644, + "balance_loss_mlp": 1.03559041, + "epoch": 0.4598226364046295, + "flos": 26620675679520.0, + "grad_norm": 2.33158108653156, + "language_loss": 0.66923088, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.69627094, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 2.783527135848999 + }, + { + "auxiliary_loss_clip": 0.01446731, + "auxiliary_loss_mlp": 0.01252939, + "balance_loss_clip": 1.13337684, + "balance_loss_mlp": 1.03054237, + "epoch": 0.45988275965729747, + "flos": 24389984581920.0, + "grad_norm": 1.4927132165083492, + "language_loss": 0.78845185, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81544852, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 2.7962775230407715 + }, + { + "auxiliary_loss_clip": 0.01445027, + "auxiliary_loss_mlp": 0.01241696, + "balance_loss_clip": 1.13298738, + "balance_loss_mlp": 1.02101612, + "epoch": 0.45994288290996543, + "flos": 24538171360320.0, + "grad_norm": 1.70514136346554, + "language_loss": 0.69057488, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71744215, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.749377727508545 + }, + { + "auxiliary_loss_clip": 0.01447201, + "auxiliary_loss_mlp": 0.01253822, + "balance_loss_clip": 1.1332587, + "balance_loss_mlp": 1.03066218, + "epoch": 0.4600030061626334, + "flos": 14832030738240.0, + "grad_norm": 1.9927261798784024, + "language_loss": 0.84589708, + "learning_rate": 2.354283194302761e-06, + "loss": 0.87290728, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 2.772172451019287 + }, + { + "auxiliary_loss_clip": 0.01454781, + "auxiliary_loss_mlp": 0.01255464, + "balance_loss_clip": 1.14198601, + "balance_loss_mlp": 1.03382969, + "epoch": 0.46006312941530136, + "flos": 18115807763520.0, + "grad_norm": 2.2784105125334997, + "language_loss": 0.75355518, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.78065765, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.754087448120117 + }, + { + "auxiliary_loss_clip": 0.01440997, + "auxiliary_loss_mlp": 0.01243471, + "balance_loss_clip": 1.12740517, + "balance_loss_mlp": 1.02240944, + "epoch": 0.46012325266796933, + "flos": 21978374339520.0, + "grad_norm": 1.7198735862750976, + "language_loss": 0.75884509, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78568977, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.8294618129730225 + }, + { + "auxiliary_loss_clip": 0.01453108, + "auxiliary_loss_mlp": 0.01250591, + "balance_loss_clip": 1.13815403, + "balance_loss_mlp": 1.02189982, + "epoch": 0.4601833759206373, + "flos": 15269877789120.0, + "grad_norm": 2.2668428544713306, + "language_loss": 0.65801901, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68505597, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.741428852081299 + }, + { + "auxiliary_loss_clip": 0.01441794, + "auxiliary_loss_mlp": 0.01251675, + "balance_loss_clip": 1.1274519, + "balance_loss_mlp": 1.03404689, + "epoch": 0.46024349917330526, + "flos": 27091444737600.0, + "grad_norm": 2.6693970784467096, + "language_loss": 0.78879786, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81573254, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.838717460632324 + }, + { + "auxiliary_loss_clip": 0.01442954, + "auxiliary_loss_mlp": 0.0124706, + "balance_loss_clip": 1.12937689, + "balance_loss_mlp": 1.02790534, + "epoch": 0.4603036224259732, + "flos": 24465272708160.0, + "grad_norm": 1.62881579478236, + "language_loss": 0.67384601, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.70074612, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.8534016609191895 + }, + { + "auxiliary_loss_clip": 0.01440286, + "auxiliary_loss_mlp": 0.01256357, + "balance_loss_clip": 1.12729049, + "balance_loss_mlp": 1.03853762, + "epoch": 0.4603637456786412, + "flos": 28111722442560.0, + "grad_norm": 1.787981661253878, + "language_loss": 0.81045389, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83742034, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 2.8070781230926514 + }, + { + "auxiliary_loss_clip": 0.01445487, + "auxiliary_loss_mlp": 0.01254041, + "balance_loss_clip": 1.1304121, + "balance_loss_mlp": 1.0356499, + "epoch": 0.4604238689313092, + "flos": 24351031997280.0, + "grad_norm": 3.355923287119471, + "language_loss": 0.70503628, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.73203158, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.842088460922241 + }, + { + "auxiliary_loss_clip": 0.01486863, + "auxiliary_loss_mlp": 0.01213959, + "balance_loss_clip": 1.19849253, + "balance_loss_mlp": 1.00949097, + "epoch": 0.4604839921839772, + "flos": 53611761412800.0, + "grad_norm": 0.9797095447968418, + "language_loss": 0.62088072, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64788896, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 3.3916072845458984 + }, + { + "auxiliary_loss_clip": 0.01448714, + "auxiliary_loss_mlp": 0.01249861, + "balance_loss_clip": 1.13644528, + "balance_loss_mlp": 1.03127909, + "epoch": 0.46054411543664514, + "flos": 31251105505440.0, + "grad_norm": 1.6294694819746325, + "language_loss": 0.68579388, + "learning_rate": 2.350832929550336e-06, + "loss": 0.71277964, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.845538377761841 + }, + { + "auxiliary_loss_clip": 0.01450669, + "auxiliary_loss_mlp": 0.01249548, + "balance_loss_clip": 1.13828516, + "balance_loss_mlp": 1.02848589, + "epoch": 0.4606042386893131, + "flos": 24094900582560.0, + "grad_norm": 1.7826687942290576, + "language_loss": 0.76854748, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79554957, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 2.798292875289917 + }, + { + "auxiliary_loss_clip": 0.0144558, + "auxiliary_loss_mlp": 0.01255699, + "balance_loss_clip": 1.1327287, + "balance_loss_mlp": 1.0365448, + "epoch": 0.46066436194198107, + "flos": 26580888675360.0, + "grad_norm": 2.499409234440069, + "language_loss": 0.74810088, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77511364, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.7563886642456055 + }, + { + "auxiliary_loss_clip": 0.01448817, + "auxiliary_loss_mlp": 0.01266833, + "balance_loss_clip": 1.13422275, + "balance_loss_mlp": 1.04443669, + "epoch": 0.46072448519464904, + "flos": 17776992231360.0, + "grad_norm": 3.0900746486816186, + "language_loss": 0.80027914, + "learning_rate": 2.349682601310998e-06, + "loss": 0.82743561, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.7251410484313965 + }, + { + "auxiliary_loss_clip": 0.01445659, + "auxiliary_loss_mlp": 0.01250241, + "balance_loss_clip": 1.13324666, + "balance_loss_mlp": 1.03013301, + "epoch": 0.460784608447317, + "flos": 15087934584000.0, + "grad_norm": 2.322006295917434, + "language_loss": 0.73986965, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.76682866, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 2.713109016418457 + }, + { + "auxiliary_loss_clip": 0.01453421, + "auxiliary_loss_mlp": 0.01243906, + "balance_loss_clip": 1.14124358, + "balance_loss_mlp": 1.02456093, + "epoch": 0.46084473169998497, + "flos": 18590255853120.0, + "grad_norm": 1.5333394638479585, + "language_loss": 0.72383875, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.75081205, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.742211103439331 + }, + { + "auxiliary_loss_clip": 0.01451731, + "auxiliary_loss_mlp": 0.01249025, + "balance_loss_clip": 1.13730359, + "balance_loss_mlp": 1.03139615, + "epoch": 0.46090485495265293, + "flos": 19496520416160.0, + "grad_norm": 1.7372666966052035, + "language_loss": 0.77812684, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80513442, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.738459348678589 + }, + { + "auxiliary_loss_clip": 0.01456372, + "auxiliary_loss_mlp": 0.01247828, + "balance_loss_clip": 1.14292598, + "balance_loss_mlp": 1.02752876, + "epoch": 0.4609649782053209, + "flos": 33367745532960.0, + "grad_norm": 1.6571522694071747, + "language_loss": 0.7392447, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76628673, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.8395578861236572 + }, + { + "auxiliary_loss_clip": 0.01453066, + "auxiliary_loss_mlp": 0.01247809, + "balance_loss_clip": 1.13999891, + "balance_loss_mlp": 1.02884507, + "epoch": 0.46102510145798886, + "flos": 23771445955200.0, + "grad_norm": 2.095100977133572, + "language_loss": 0.76338053, + "learning_rate": 2.347765122572676e-06, + "loss": 0.7903893, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.7331883907318115 + }, + { + "auxiliary_loss_clip": 0.01458547, + "auxiliary_loss_mlp": 0.01255208, + "balance_loss_clip": 1.14635265, + "balance_loss_mlp": 1.04005861, + "epoch": 0.4610852247106568, + "flos": 23297073721920.0, + "grad_norm": 1.8281527898933752, + "language_loss": 0.784284, + "learning_rate": 2.347381587204975e-06, + "loss": 0.81142151, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 4.349703073501587 + }, + { + "auxiliary_loss_clip": 0.01457638, + "auxiliary_loss_mlp": 0.012574, + "balance_loss_clip": 1.14442873, + "balance_loss_mlp": 1.03805542, + "epoch": 0.4611453479633248, + "flos": 25449821722080.0, + "grad_norm": 1.8832925358143564, + "language_loss": 0.83051533, + "learning_rate": 2.34699803866453e-06, + "loss": 0.85766566, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.7857539653778076 + }, + { + "auxiliary_loss_clip": 0.01455029, + "auxiliary_loss_mlp": 0.01244292, + "balance_loss_clip": 1.14240229, + "balance_loss_mlp": 1.02513766, + "epoch": 0.4612054712159928, + "flos": 21141594894240.0, + "grad_norm": 1.679541312319474, + "language_loss": 0.63783962, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.66483277, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 2.8326733112335205 + }, + { + "auxiliary_loss_clip": 0.01485355, + "auxiliary_loss_mlp": 0.01213127, + "balance_loss_clip": 1.19739521, + "balance_loss_mlp": 1.00789642, + "epoch": 0.4612655944686608, + "flos": 69966130010400.0, + "grad_norm": 0.6806000391272577, + "language_loss": 0.55825019, + "learning_rate": 2.346230902123583e-06, + "loss": 0.585235, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 3.4347217082977295 + }, + { + "auxiliary_loss_clip": 0.01452622, + "auxiliary_loss_mlp": 0.01252341, + "balance_loss_clip": 1.14025748, + "balance_loss_mlp": 1.03146982, + "epoch": 0.46132571772132874, + "flos": 16839057290400.0, + "grad_norm": 2.9743932850781456, + "language_loss": 0.71247554, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.7395252, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.803971767425537 + }, + { + "auxiliary_loss_clip": 0.01457237, + "auxiliary_loss_mlp": 0.01246249, + "balance_loss_clip": 1.14535284, + "balance_loss_mlp": 1.0265224, + "epoch": 0.4613858409739967, + "flos": 35811860572800.0, + "grad_norm": 1.8162037486020295, + "language_loss": 0.71024781, + "learning_rate": 2.345463713066195e-06, + "loss": 0.73728263, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.904296398162842 + }, + { + "auxiliary_loss_clip": 0.01446347, + "auxiliary_loss_mlp": 0.01247767, + "balance_loss_clip": 1.13240397, + "balance_loss_mlp": 1.02994776, + "epoch": 0.4614459642266647, + "flos": 35269937557920.0, + "grad_norm": 1.472543126850114, + "language_loss": 0.65620506, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.68314624, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 4.548037767410278 + }, + { + "auxiliary_loss_clip": 0.01479267, + "auxiliary_loss_mlp": 0.01212891, + "balance_loss_clip": 1.19198871, + "balance_loss_mlp": 1.00842285, + "epoch": 0.46150608747933264, + "flos": 66710875325760.0, + "grad_norm": 0.7540899619750038, + "language_loss": 0.5851298, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.61205137, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 4.812359571456909 + }, + { + "auxiliary_loss_clip": 0.01475975, + "auxiliary_loss_mlp": 0.0120929, + "balance_loss_clip": 1.18904257, + "balance_loss_mlp": 1.00405884, + "epoch": 0.4615662107320006, + "flos": 55835739226080.0, + "grad_norm": 0.8029586774634426, + "language_loss": 0.62719649, + "learning_rate": 2.344312831266341e-06, + "loss": 0.6540491, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 3.1208600997924805 + }, + { + "auxiliary_loss_clip": 0.01448911, + "auxiliary_loss_mlp": 0.01252736, + "balance_loss_clip": 1.13593709, + "balance_loss_mlp": 1.03587079, + "epoch": 0.46162633398466857, + "flos": 15484856785920.0, + "grad_norm": 2.2968567991858038, + "language_loss": 0.76022065, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78723711, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 2.7222917079925537 + }, + { + "auxiliary_loss_clip": 0.01450487, + "auxiliary_loss_mlp": 0.01254966, + "balance_loss_clip": 1.1372962, + "balance_loss_mlp": 1.03905416, + "epoch": 0.46168645723733653, + "flos": 20013524265600.0, + "grad_norm": 2.966488008914292, + "language_loss": 0.66537172, + "learning_rate": 2.343545511426974e-06, + "loss": 0.69242632, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 2.8334290981292725 + }, + { + "auxiliary_loss_clip": 0.01450633, + "auxiliary_loss_mlp": 0.01257024, + "balance_loss_clip": 1.13710499, + "balance_loss_mlp": 1.03710651, + "epoch": 0.4617465804900045, + "flos": 20300377854240.0, + "grad_norm": 2.386416818248592, + "language_loss": 0.7004956, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.7275722, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 2.7597711086273193 + }, + { + "auxiliary_loss_clip": 0.01456772, + "auxiliary_loss_mlp": 0.012637, + "balance_loss_clip": 1.14433694, + "balance_loss_mlp": 1.04359174, + "epoch": 0.46180670374267246, + "flos": 22348518896160.0, + "grad_norm": 1.845612971673861, + "language_loss": 0.63804758, + "learning_rate": 2.342778139478487e-06, + "loss": 0.66525233, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 2.777106285095215 + }, + { + "auxiliary_loss_clip": 0.01450223, + "auxiliary_loss_mlp": 0.0125882, + "balance_loss_clip": 1.13728666, + "balance_loss_mlp": 1.04138184, + "epoch": 0.46186682699534043, + "flos": 19897425074880.0, + "grad_norm": 1.503259085896481, + "language_loss": 0.67314839, + "learning_rate": 2.342394433999697e-06, + "loss": 0.70023888, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 2.8015594482421875 + }, + { + "auxiliary_loss_clip": 0.01457438, + "auxiliary_loss_mlp": 0.01254782, + "balance_loss_clip": 1.14457989, + "balance_loss_mlp": 1.03581882, + "epoch": 0.4619269502480084, + "flos": 31506250788000.0, + "grad_norm": 3.356979028635302, + "language_loss": 0.7427367, + "learning_rate": 2.342010715537275e-06, + "loss": 0.7698589, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 2.9161205291748047 + }, + { + "auxiliary_loss_clip": 0.01452319, + "auxiliary_loss_mlp": 0.01250363, + "balance_loss_clip": 1.13889694, + "balance_loss_mlp": 1.03197181, + "epoch": 0.46198707350067636, + "flos": 25011822958560.0, + "grad_norm": 2.094641729008843, + "language_loss": 0.76921999, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.79624677, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 4.437100172042847 + }, + { + "auxiliary_loss_clip": 0.01454081, + "auxiliary_loss_mlp": 0.01262151, + "balance_loss_clip": 1.14044595, + "balance_loss_mlp": 1.04318738, + "epoch": 0.4620471967533444, + "flos": 18294109865280.0, + "grad_norm": 1.7536246957554749, + "language_loss": 0.79732203, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82448435, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 2.9153451919555664 + }, + { + "auxiliary_loss_clip": 0.01457243, + "auxiliary_loss_mlp": 0.01248956, + "balance_loss_clip": 1.14490032, + "balance_loss_mlp": 1.0292294, + "epoch": 0.46210732000601235, + "flos": 33987952998720.0, + "grad_norm": 2.0231646103564054, + "language_loss": 0.66201419, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68907619, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 2.9726502895355225 + }, + { + "auxiliary_loss_clip": 0.0145318, + "auxiliary_loss_mlp": 0.01263924, + "balance_loss_clip": 1.13888907, + "balance_loss_mlp": 1.04381633, + "epoch": 0.4621674432586803, + "flos": 25011747102240.0, + "grad_norm": 2.167607695127726, + "language_loss": 0.7442019, + "learning_rate": 2.340475712142296e-06, + "loss": 0.77137291, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 2.895103931427002 + }, + { + "auxiliary_loss_clip": 0.01458327, + "auxiliary_loss_mlp": 0.01249679, + "balance_loss_clip": 1.14650846, + "balance_loss_mlp": 1.03052449, + "epoch": 0.4622275665113483, + "flos": 22015999438560.0, + "grad_norm": 2.365546250287906, + "language_loss": 0.7522788, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.7793588, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 2.8054230213165283 + }, + { + "auxiliary_loss_clip": 0.01453506, + "auxiliary_loss_mlp": 0.01248042, + "balance_loss_clip": 1.14084423, + "balance_loss_mlp": 1.02717113, + "epoch": 0.46228768976401624, + "flos": 24060992443200.0, + "grad_norm": 1.7364451666563632, + "language_loss": 0.78908134, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.81609678, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 2.8269736766815186 + }, + { + "auxiliary_loss_clip": 0.01457733, + "auxiliary_loss_mlp": 0.01258724, + "balance_loss_clip": 1.14376879, + "balance_loss_mlp": 1.03937876, + "epoch": 0.4623478130166842, + "flos": 26653825255680.0, + "grad_norm": 2.468332315087855, + "language_loss": 0.57411611, + "learning_rate": 2.339324323980964e-06, + "loss": 0.60128069, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.891810894012451 + }, + { + "auxiliary_loss_clip": 0.01455409, + "auxiliary_loss_mlp": 0.01251633, + "balance_loss_clip": 1.14314651, + "balance_loss_mlp": 1.03324151, + "epoch": 0.46240793626935217, + "flos": 20560529653920.0, + "grad_norm": 2.5842057078467153, + "language_loss": 0.82745385, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85452425, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.8537437915802 + }, + { + "auxiliary_loss_clip": 0.01455598, + "auxiliary_loss_mlp": 0.01243238, + "balance_loss_clip": 1.14371586, + "balance_loss_mlp": 1.02408338, + "epoch": 0.46246805952202014, + "flos": 22458284084160.0, + "grad_norm": 1.3615885127544138, + "language_loss": 0.754861, + "learning_rate": 2.338556667513091e-06, + "loss": 0.78184932, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.9559874534606934 + }, + { + "auxiliary_loss_clip": 0.01456134, + "auxiliary_loss_mlp": 0.01256855, + "balance_loss_clip": 1.14398861, + "balance_loss_mlp": 1.03674746, + "epoch": 0.4625281827746881, + "flos": 35044111107360.0, + "grad_norm": 1.9494603407823639, + "language_loss": 0.74001539, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76714528, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.9710772037506104 + }, + { + "auxiliary_loss_clip": 0.01461126, + "auxiliary_loss_mlp": 0.01250398, + "balance_loss_clip": 1.14880025, + "balance_loss_mlp": 1.02990878, + "epoch": 0.46258830602735607, + "flos": 21070744362720.0, + "grad_norm": 1.778840456776577, + "language_loss": 0.8573072, + "learning_rate": 2.337788959692808e-06, + "loss": 0.88442242, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.871751546859741 + }, + { + "auxiliary_loss_clip": 0.01458986, + "auxiliary_loss_mlp": 0.01259854, + "balance_loss_clip": 1.14634204, + "balance_loss_mlp": 1.04012716, + "epoch": 0.46264842928002403, + "flos": 26179642663200.0, + "grad_norm": 24.45861189946397, + "language_loss": 0.7904653, + "learning_rate": 2.337405086561902e-06, + "loss": 0.81765372, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 2.8980445861816406 + }, + { + "auxiliary_loss_clip": 0.01452647, + "auxiliary_loss_mlp": 0.01253138, + "balance_loss_clip": 1.14034009, + "balance_loss_mlp": 1.03360224, + "epoch": 0.462708552532692, + "flos": 16766120710080.0, + "grad_norm": 1.7756844806669958, + "language_loss": 0.72505319, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.75211108, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 2.9130859375 + }, + { + "auxiliary_loss_clip": 0.01453985, + "auxiliary_loss_mlp": 0.01247142, + "balance_loss_clip": 1.14198291, + "balance_loss_mlp": 1.02493548, + "epoch": 0.46276867578535996, + "flos": 15562306817280.0, + "grad_norm": 1.8375823891951082, + "language_loss": 0.69753671, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.72454798, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 2.8149969577789307 + }, + { + "auxiliary_loss_clip": 0.0145445, + "auxiliary_loss_mlp": 0.0124815, + "balance_loss_clip": 1.14246416, + "balance_loss_mlp": 1.0280422, + "epoch": 0.462828799038028, + "flos": 22417397163360.0, + "grad_norm": 5.648868157591627, + "language_loss": 0.84736061, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.87438667, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.89158296585083 + }, + { + "auxiliary_loss_clip": 0.01454102, + "auxiliary_loss_mlp": 0.01249519, + "balance_loss_clip": 1.14354992, + "balance_loss_mlp": 1.03112721, + "epoch": 0.46288892229069595, + "flos": 21071730494880.0, + "grad_norm": 1.8539169516068896, + "language_loss": 0.71358418, + "learning_rate": 2.335869466239502e-06, + "loss": 0.74062037, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 2.7811660766601562 + }, + { + "auxiliary_loss_clip": 0.01453633, + "auxiliary_loss_mlp": 0.01251299, + "balance_loss_clip": 1.14156365, + "balance_loss_mlp": 1.03214419, + "epoch": 0.4629490455433639, + "flos": 23187953312640.0, + "grad_norm": 1.9372233477007608, + "language_loss": 0.71685708, + "learning_rate": 2.335485529281996e-06, + "loss": 0.74390638, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.7664458751678467 + }, + { + "auxiliary_loss_clip": 0.01460467, + "auxiliary_loss_mlp": 0.01250768, + "balance_loss_clip": 1.14841807, + "balance_loss_mlp": 1.03199506, + "epoch": 0.4630091687960319, + "flos": 18837094868640.0, + "grad_norm": 2.044593875357559, + "language_loss": 0.727332, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.7544443, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 2.7807703018188477 + }, + { + "auxiliary_loss_clip": 0.01455873, + "auxiliary_loss_mlp": 0.01248976, + "balance_loss_clip": 1.14304435, + "balance_loss_mlp": 1.0281055, + "epoch": 0.46306929204869984, + "flos": 38909939505120.0, + "grad_norm": 2.1127564139731483, + "language_loss": 0.65079033, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.6778388, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 2.908108949661255 + }, + { + "auxiliary_loss_clip": 0.01449679, + "auxiliary_loss_mlp": 0.01241126, + "balance_loss_clip": 1.1375773, + "balance_loss_mlp": 1.02330673, + "epoch": 0.4631294153013678, + "flos": 19646110536480.0, + "grad_norm": 2.3092158384624826, + "language_loss": 0.73568034, + "learning_rate": 2.33433364213785e-06, + "loss": 0.76258838, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.7254726886749268 + }, + { + "auxiliary_loss_clip": 0.01455067, + "auxiliary_loss_mlp": 0.01247378, + "balance_loss_clip": 1.14164078, + "balance_loss_mlp": 1.02517164, + "epoch": 0.4631895385540358, + "flos": 24610576946400.0, + "grad_norm": 49.63969160537785, + "language_loss": 0.69311172, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.72013617, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 2.762146472930908 + }, + { + "auxiliary_loss_clip": 0.01461303, + "auxiliary_loss_mlp": 0.01251593, + "balance_loss_clip": 1.14855576, + "balance_loss_mlp": 1.03243899, + "epoch": 0.46324966180670374, + "flos": 26322367786560.0, + "grad_norm": 2.547940735466216, + "language_loss": 0.80757457, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83470351, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.85296893119812 + }, + { + "auxiliary_loss_clip": 0.01454552, + "auxiliary_loss_mlp": 0.01256715, + "balance_loss_clip": 1.14027727, + "balance_loss_mlp": 1.03717923, + "epoch": 0.4633097850593717, + "flos": 19242285409440.0, + "grad_norm": 1.7107171367607068, + "language_loss": 0.77777457, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.80488729, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.795977830886841 + }, + { + "auxiliary_loss_clip": 0.01452492, + "auxiliary_loss_mlp": 0.01240998, + "balance_loss_clip": 1.14095843, + "balance_loss_mlp": 1.02451444, + "epoch": 0.46336990831203967, + "flos": 22785341886720.0, + "grad_norm": 1.9595844638469155, + "language_loss": 0.70610154, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.73303646, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.7848474979400635 + }, + { + "auxiliary_loss_clip": 0.01452207, + "auxiliary_loss_mlp": 0.01246312, + "balance_loss_clip": 1.13944507, + "balance_loss_mlp": 1.02753949, + "epoch": 0.46343003156470763, + "flos": 38213306068320.0, + "grad_norm": 2.4076269083375523, + "language_loss": 0.61597729, + "learning_rate": 2.332413576865791e-06, + "loss": 0.64296246, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 4.5623109340667725 + }, + { + "auxiliary_loss_clip": 0.01451147, + "auxiliary_loss_mlp": 0.01245654, + "balance_loss_clip": 1.13900018, + "balance_loss_mlp": 1.02516437, + "epoch": 0.4634901548173756, + "flos": 31941101514240.0, + "grad_norm": 2.9857943954826567, + "language_loss": 0.77558017, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.80254817, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 2.906186103820801 + }, + { + "auxiliary_loss_clip": 0.01455676, + "auxiliary_loss_mlp": 0.01260223, + "balance_loss_clip": 1.14281869, + "balance_loss_mlp": 1.03973377, + "epoch": 0.46355027807004356, + "flos": 20084526509760.0, + "grad_norm": 1.702284138257196, + "language_loss": 0.77128047, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79843944, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.780522108078003 + }, + { + "auxiliary_loss_clip": 0.01456737, + "auxiliary_loss_mlp": 0.01249884, + "balance_loss_clip": 1.14475393, + "balance_loss_mlp": 1.02596092, + "epoch": 0.4636104013227116, + "flos": 24063685342560.0, + "grad_norm": 1.9418384829826505, + "language_loss": 0.73669362, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.76375985, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 2.843921422958374 + }, + { + "auxiliary_loss_clip": 0.0145864, + "auxiliary_loss_mlp": 0.01255811, + "balance_loss_clip": 1.14670563, + "balance_loss_mlp": 1.03646553, + "epoch": 0.46367052457537955, + "flos": 23916901906080.0, + "grad_norm": 1.357045185157995, + "language_loss": 0.71683627, + "learning_rate": 2.33087729766797e-06, + "loss": 0.74398077, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 2.770073175430298 + }, + { + "auxiliary_loss_clip": 0.01458024, + "auxiliary_loss_mlp": 0.01249602, + "balance_loss_clip": 1.14649963, + "balance_loss_mlp": 1.02834964, + "epoch": 0.4637306478280475, + "flos": 26398983398400.0, + "grad_norm": 1.7956740899178583, + "language_loss": 0.73303956, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.7601158, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.812588930130005 + }, + { + "auxiliary_loss_clip": 0.01458712, + "auxiliary_loss_mlp": 0.01249237, + "balance_loss_clip": 1.14726806, + "balance_loss_mlp": 1.02684021, + "epoch": 0.4637907710807155, + "flos": 21982849862400.0, + "grad_norm": 1.5481527432220565, + "language_loss": 0.58286846, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60994792, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 4.375011205673218 + }, + { + "auxiliary_loss_clip": 0.01455322, + "auxiliary_loss_mlp": 0.01243438, + "balance_loss_clip": 1.14332962, + "balance_loss_mlp": 1.02599978, + "epoch": 0.46385089433338345, + "flos": 12423947814720.0, + "grad_norm": 2.0361871352264385, + "language_loss": 0.69895053, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72593808, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 4.251397371292114 + }, + { + "auxiliary_loss_clip": 0.01461931, + "auxiliary_loss_mlp": 0.01262353, + "balance_loss_clip": 1.15007162, + "balance_loss_mlp": 1.0433898, + "epoch": 0.4639110175860514, + "flos": 23918077679040.0, + "grad_norm": 1.8288139464151223, + "language_loss": 0.6787535, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70599639, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 2.7490248680114746 + }, + { + "auxiliary_loss_clip": 0.01453544, + "auxiliary_loss_mlp": 0.01250769, + "balance_loss_clip": 1.14082754, + "balance_loss_mlp": 1.0331409, + "epoch": 0.4639711408387194, + "flos": 25302696932160.0, + "grad_norm": 2.9561548907226465, + "language_loss": 0.81285471, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83989787, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 2.976576089859009 + }, + { + "auxiliary_loss_clip": 0.01455069, + "auxiliary_loss_mlp": 0.01258495, + "balance_loss_clip": 1.14377081, + "balance_loss_mlp": 1.04277408, + "epoch": 0.46403126409138734, + "flos": 21213772911360.0, + "grad_norm": 1.7686140487679327, + "language_loss": 0.73399568, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.76113129, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 2.7999515533447266 + }, + { + "auxiliary_loss_clip": 0.01452954, + "auxiliary_loss_mlp": 0.01262767, + "balance_loss_clip": 1.14041567, + "balance_loss_mlp": 1.04456639, + "epoch": 0.4640913873440553, + "flos": 35848499539680.0, + "grad_norm": 2.399462403860068, + "language_loss": 0.70729399, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.73445123, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.8690085411071777 + }, + { + "auxiliary_loss_clip": 0.01459626, + "auxiliary_loss_mlp": 0.01271192, + "balance_loss_clip": 1.14850116, + "balance_loss_mlp": 1.0541358, + "epoch": 0.46415151059672327, + "flos": 19167907559040.0, + "grad_norm": 2.0660474304152547, + "language_loss": 0.86848211, + "learning_rate": 2.327804137953357e-06, + "loss": 0.89579034, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 2.7402050495147705 + }, + { + "auxiliary_loss_clip": 0.01520187, + "auxiliary_loss_mlp": 0.01212784, + "balance_loss_clip": 1.23574471, + "balance_loss_mlp": 1.00831604, + "epoch": 0.46421163384939124, + "flos": 58919480949600.0, + "grad_norm": 0.7118739036951308, + "language_loss": 0.54909009, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57641983, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 3.346466541290283 + }, + { + "auxiliary_loss_clip": 0.01465846, + "auxiliary_loss_mlp": 0.01256787, + "balance_loss_clip": 1.1537776, + "balance_loss_mlp": 1.03801465, + "epoch": 0.4642717571020592, + "flos": 20159435354400.0, + "grad_norm": 2.1330173378364212, + "language_loss": 0.80281305, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.83003944, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 4.205268859863281 + }, + { + "auxiliary_loss_clip": 0.01453318, + "auxiliary_loss_mlp": 0.01262249, + "balance_loss_clip": 1.14098072, + "balance_loss_mlp": 1.04385734, + "epoch": 0.46433188035472717, + "flos": 25048537781760.0, + "grad_norm": 1.9456431037449116, + "language_loss": 0.77751458, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80467027, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 2.8274691104888916 + }, + { + "auxiliary_loss_clip": 0.01450659, + "auxiliary_loss_mlp": 0.01240411, + "balance_loss_clip": 1.13916743, + "balance_loss_mlp": 1.02201915, + "epoch": 0.4643920036073952, + "flos": 28078231512960.0, + "grad_norm": 1.4762342905564134, + "language_loss": 0.68473381, + "learning_rate": 2.326267259301118e-06, + "loss": 0.71164453, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 2.790813446044922 + }, + { + "auxiliary_loss_clip": 0.01455903, + "auxiliary_loss_mlp": 0.01246507, + "balance_loss_clip": 1.144207, + "balance_loss_mlp": 1.02792525, + "epoch": 0.46445212686006315, + "flos": 18371332327680.0, + "grad_norm": 2.4054833381088767, + "language_loss": 0.67286581, + "learning_rate": 2.325883008671415e-06, + "loss": 0.6998899, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.7926957607269287 + }, + { + "auxiliary_loss_clip": 0.01453991, + "auxiliary_loss_mlp": 0.01242097, + "balance_loss_clip": 1.14356065, + "balance_loss_mlp": 1.0252316, + "epoch": 0.4645122501127311, + "flos": 31724264037600.0, + "grad_norm": 1.6899685878406956, + "language_loss": 0.65203983, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67900074, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 2.8173015117645264 + }, + { + "auxiliary_loss_clip": 0.01456856, + "auxiliary_loss_mlp": 0.0124524, + "balance_loss_clip": 1.14465201, + "balance_loss_mlp": 1.02494097, + "epoch": 0.4645723733653991, + "flos": 23771066673600.0, + "grad_norm": 1.8424408741344371, + "language_loss": 0.74741781, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.7744388, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.827847957611084 + }, + { + "auxiliary_loss_clip": 0.01447128, + "auxiliary_loss_mlp": 0.01249496, + "balance_loss_clip": 1.13611913, + "balance_loss_mlp": 1.0303421, + "epoch": 0.46463249661806705, + "flos": 33148291013280.0, + "grad_norm": 3.2070669248113117, + "language_loss": 0.78893793, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.81590426, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 2.803298234939575 + }, + { + "auxiliary_loss_clip": 0.01450975, + "auxiliary_loss_mlp": 0.01266551, + "balance_loss_clip": 1.13869333, + "balance_loss_mlp": 1.04892254, + "epoch": 0.464692619870735, + "flos": 18297978537600.0, + "grad_norm": 1.9269541633002158, + "language_loss": 0.75755352, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78472883, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.7536401748657227 + }, + { + "auxiliary_loss_clip": 0.01450931, + "auxiliary_loss_mlp": 0.01247517, + "balance_loss_clip": 1.13955045, + "balance_loss_mlp": 1.02702761, + "epoch": 0.464752743123403, + "flos": 22640265217440.0, + "grad_norm": 1.5926592361050782, + "language_loss": 0.80058408, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82756865, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.8067405223846436 + }, + { + "auxiliary_loss_clip": 0.01447278, + "auxiliary_loss_mlp": 0.01242238, + "balance_loss_clip": 1.13482118, + "balance_loss_mlp": 1.02327466, + "epoch": 0.46481286637607094, + "flos": 20414201355360.0, + "grad_norm": 1.524476603495807, + "language_loss": 0.77084571, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79774082, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.775721549987793 + }, + { + "auxiliary_loss_clip": 0.01448672, + "auxiliary_loss_mlp": 0.01242855, + "balance_loss_clip": 1.13669741, + "balance_loss_mlp": 1.02484512, + "epoch": 0.4648729896287389, + "flos": 34278182193600.0, + "grad_norm": 1.9827535271162628, + "language_loss": 0.6603179, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68723315, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 2.8861279487609863 + }, + { + "auxiliary_loss_clip": 0.01449725, + "auxiliary_loss_mlp": 0.01254221, + "balance_loss_clip": 1.13772488, + "balance_loss_mlp": 1.03125191, + "epoch": 0.4649331128814069, + "flos": 21323765668320.0, + "grad_norm": 2.3479046782836095, + "language_loss": 0.73270124, + "learning_rate": 2.32280855998725e-06, + "loss": 0.75974071, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 2.7469146251678467 + }, + { + "auxiliary_loss_clip": 0.01510073, + "auxiliary_loss_mlp": 0.01222954, + "balance_loss_clip": 1.22471297, + "balance_loss_mlp": 1.0200119, + "epoch": 0.46499323613407484, + "flos": 58314065830560.0, + "grad_norm": 1.372120067754872, + "language_loss": 0.5192312, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54656154, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 3.29384183883667 + }, + { + "auxiliary_loss_clip": 0.0144511, + "auxiliary_loss_mlp": 0.0125891, + "balance_loss_clip": 1.13429558, + "balance_loss_mlp": 1.03746676, + "epoch": 0.4650533593867428, + "flos": 10891217639520.0, + "grad_norm": 2.018538200311713, + "language_loss": 0.75328863, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.78032887, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 2.75626277923584 + }, + { + "auxiliary_loss_clip": 0.01453149, + "auxiliary_loss_mlp": 0.01256055, + "balance_loss_clip": 1.1413064, + "balance_loss_mlp": 1.03918922, + "epoch": 0.46511348263941077, + "flos": 19976657729760.0, + "grad_norm": 1.7506304183605503, + "language_loss": 0.69495004, + "learning_rate": 2.321655439354519e-06, + "loss": 0.72204208, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 2.7976062297821045 + }, + { + "auxiliary_loss_clip": 0.0145203, + "auxiliary_loss_mlp": 0.01240516, + "balance_loss_clip": 1.13935161, + "balance_loss_mlp": 1.02365041, + "epoch": 0.46517360589207873, + "flos": 19680208316640.0, + "grad_norm": 1.7752302952023633, + "language_loss": 0.72234344, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74926889, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.729520082473755 + }, + { + "auxiliary_loss_clip": 0.01457239, + "auxiliary_loss_mlp": 0.01251978, + "balance_loss_clip": 1.14381611, + "balance_loss_mlp": 1.03015375, + "epoch": 0.46523372914474675, + "flos": 16874520484320.0, + "grad_norm": 1.973437815317795, + "language_loss": 0.83851409, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.86560631, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.836033582687378 + }, + { + "auxiliary_loss_clip": 0.01508868, + "auxiliary_loss_mlp": 0.01214195, + "balance_loss_clip": 1.22324228, + "balance_loss_mlp": 1.01049042, + "epoch": 0.4652938523974147, + "flos": 53445520465920.0, + "grad_norm": 0.7690455807408779, + "language_loss": 0.57720327, + "learning_rate": 2.320502208946932e-06, + "loss": 0.60443389, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 3.3632428646087646 + }, + { + "auxiliary_loss_clip": 0.0145563, + "auxiliary_loss_mlp": 0.01247989, + "balance_loss_clip": 1.14471149, + "balance_loss_mlp": 1.02711773, + "epoch": 0.4653539756500827, + "flos": 15233087109600.0, + "grad_norm": 1.8194165318164053, + "language_loss": 0.85155654, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87859273, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 2.7521698474884033 + }, + { + "auxiliary_loss_clip": 0.01454321, + "auxiliary_loss_mlp": 0.01247764, + "balance_loss_clip": 1.14332211, + "balance_loss_mlp": 1.02784693, + "epoch": 0.46541409890275065, + "flos": 23734503563040.0, + "grad_norm": 1.5797716878607002, + "language_loss": 0.76091295, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78793383, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.856642723083496 + }, + { + "auxiliary_loss_clip": 0.01456388, + "auxiliary_loss_mlp": 0.01258909, + "balance_loss_clip": 1.14445353, + "balance_loss_mlp": 1.0384196, + "epoch": 0.4654742221554186, + "flos": 20849279650560.0, + "grad_norm": 4.8183046855741125, + "language_loss": 0.81087959, + "learning_rate": 2.319348869158064e-06, + "loss": 0.83803254, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 2.731271982192993 + }, + { + "auxiliary_loss_clip": 0.01455928, + "auxiliary_loss_mlp": 0.01252975, + "balance_loss_clip": 1.14217186, + "balance_loss_mlp": 1.03267634, + "epoch": 0.4655343454080866, + "flos": 20706971736960.0, + "grad_norm": 1.836477403570476, + "language_loss": 0.72816133, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.75525039, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.7396903038024902 + }, + { + "auxiliary_loss_clip": 0.01450039, + "auxiliary_loss_mlp": 0.01261047, + "balance_loss_clip": 1.13758528, + "balance_loss_mlp": 1.03979492, + "epoch": 0.46559446866075455, + "flos": 18991881146880.0, + "grad_norm": 2.813937284711216, + "language_loss": 0.7188673, + "learning_rate": 2.318579915392483e-06, + "loss": 0.74597812, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 2.733579635620117 + }, + { + "auxiliary_loss_clip": 0.0144793, + "auxiliary_loss_mlp": 0.01247666, + "balance_loss_clip": 1.1368165, + "balance_loss_mlp": 1.03080058, + "epoch": 0.4656545919134225, + "flos": 34499267624160.0, + "grad_norm": 1.7237361624582155, + "language_loss": 0.84893417, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87589014, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 4.535351514816284 + }, + { + "auxiliary_loss_clip": 0.01451732, + "auxiliary_loss_mlp": 0.0124379, + "balance_loss_clip": 1.1387732, + "balance_loss_mlp": 1.02616167, + "epoch": 0.4657147151660905, + "flos": 24312534550560.0, + "grad_norm": 1.634974224782504, + "language_loss": 0.72913551, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75609076, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 2.787651300430298 + }, + { + "auxiliary_loss_clip": 0.01446635, + "auxiliary_loss_mlp": 0.01243925, + "balance_loss_clip": 1.13425684, + "balance_loss_mlp": 1.02896738, + "epoch": 0.46577483841875844, + "flos": 58799361738240.0, + "grad_norm": 1.6179642696490604, + "language_loss": 0.69975233, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72665793, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 3.066265344619751 + }, + { + "auxiliary_loss_clip": 0.01446815, + "auxiliary_loss_mlp": 0.01243014, + "balance_loss_clip": 1.13448286, + "balance_loss_mlp": 1.02500403, + "epoch": 0.4658349616714264, + "flos": 31324686864480.0, + "grad_norm": 1.7516989379241872, + "language_loss": 0.6727649, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69966322, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.8365516662597656 + }, + { + "auxiliary_loss_clip": 0.01444579, + "auxiliary_loss_mlp": 0.01255222, + "balance_loss_clip": 1.13068628, + "balance_loss_mlp": 1.03568649, + "epoch": 0.46589508492409437, + "flos": 14861766780000.0, + "grad_norm": 2.1562949279364925, + "language_loss": 0.64109552, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66809356, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 2.9018142223358154 + }, + { + "auxiliary_loss_clip": 0.01457057, + "auxiliary_loss_mlp": 0.01270794, + "balance_loss_clip": 1.14510381, + "balance_loss_mlp": 1.04820669, + "epoch": 0.46595520817676234, + "flos": 12897637341120.0, + "grad_norm": 2.1798381897726924, + "language_loss": 0.74520671, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.77248526, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 2.7841062545776367 + }, + { + "auxiliary_loss_clip": 0.01452848, + "auxiliary_loss_mlp": 0.01258648, + "balance_loss_clip": 1.14151525, + "balance_loss_mlp": 1.04025686, + "epoch": 0.46601533142943036, + "flos": 32856961901760.0, + "grad_norm": 1.8298942867279393, + "language_loss": 0.74062657, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76774156, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.9335858821868896 + }, + { + "auxiliary_loss_clip": 0.01456647, + "auxiliary_loss_mlp": 0.0125387, + "balance_loss_clip": 1.14400554, + "balance_loss_mlp": 1.03090096, + "epoch": 0.4660754546820983, + "flos": 19969185882240.0, + "grad_norm": 1.7584785208947606, + "language_loss": 0.74121058, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.76831573, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 4.42493748664856 + }, + { + "auxiliary_loss_clip": 0.0145486, + "auxiliary_loss_mlp": 0.01253019, + "balance_loss_clip": 1.14094639, + "balance_loss_mlp": 1.03195763, + "epoch": 0.4661355779347663, + "flos": 26690615935200.0, + "grad_norm": 3.761939493398429, + "language_loss": 0.6945374, + "learning_rate": 2.315119027142644e-06, + "loss": 0.72161615, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 4.3714823722839355 + }, + { + "auxiliary_loss_clip": 0.01453521, + "auxiliary_loss_mlp": 0.01254597, + "balance_loss_clip": 1.14182138, + "balance_loss_mlp": 1.03906691, + "epoch": 0.46619570118743425, + "flos": 20961548097120.0, + "grad_norm": 1.7995331632649592, + "language_loss": 0.73037171, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.75745296, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 2.8738739490509033 + }, + { + "auxiliary_loss_clip": 0.01453824, + "auxiliary_loss_mlp": 0.01257184, + "balance_loss_clip": 1.14175606, + "balance_loss_mlp": 1.03974605, + "epoch": 0.4662558244401022, + "flos": 24428406172320.0, + "grad_norm": 1.6053109028543557, + "language_loss": 0.78662616, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81373626, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 2.9327943325042725 + }, + { + "auxiliary_loss_clip": 0.01453891, + "auxiliary_loss_mlp": 0.01255418, + "balance_loss_clip": 1.14284396, + "balance_loss_mlp": 1.03950572, + "epoch": 0.4663159476927702, + "flos": 20597585830560.0, + "grad_norm": 1.7457294498478027, + "language_loss": 0.72521812, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.75231117, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 2.8609602451324463 + }, + { + "auxiliary_loss_clip": 0.01451335, + "auxiliary_loss_mlp": 0.01246673, + "balance_loss_clip": 1.13995004, + "balance_loss_mlp": 1.03037989, + "epoch": 0.46637607094543815, + "flos": 25664004227520.0, + "grad_norm": 1.741245367159185, + "language_loss": 0.78484559, + "learning_rate": 2.313580543272274e-06, + "loss": 0.81182563, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 2.970519781112671 + }, + { + "auxiliary_loss_clip": 0.01451057, + "auxiliary_loss_mlp": 0.01257641, + "balance_loss_clip": 1.13976502, + "balance_loss_mlp": 1.041538, + "epoch": 0.4664361941981061, + "flos": 24275819727360.0, + "grad_norm": 1.7890882525556349, + "language_loss": 0.66303027, + "learning_rate": 2.313195892540705e-06, + "loss": 0.69011724, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.906113862991333 + }, + { + "auxiliary_loss_clip": 0.01454883, + "auxiliary_loss_mlp": 0.01249799, + "balance_loss_clip": 1.14367747, + "balance_loss_mlp": 1.03159869, + "epoch": 0.4664963174507741, + "flos": 18407857510080.0, + "grad_norm": 1.6772102698904179, + "language_loss": 0.74867404, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.77572083, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.81233286857605 + }, + { + "auxiliary_loss_clip": 0.01451422, + "auxiliary_loss_mlp": 0.01250532, + "balance_loss_clip": 1.14029515, + "balance_loss_mlp": 1.03385687, + "epoch": 0.46655644070344204, + "flos": 22457297952000.0, + "grad_norm": 1.527735182450596, + "language_loss": 0.77786976, + "learning_rate": 2.312426555462893e-06, + "loss": 0.80488932, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 4.261553764343262 + }, + { + "auxiliary_loss_clip": 0.01450278, + "auxiliary_loss_mlp": 0.01246308, + "balance_loss_clip": 1.14048386, + "balance_loss_mlp": 1.03001487, + "epoch": 0.46661656395611, + "flos": 13810122122400.0, + "grad_norm": 1.6944870622850334, + "language_loss": 0.73946518, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76643109, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 2.8699848651885986 + }, + { + "auxiliary_loss_clip": 0.01459826, + "auxiliary_loss_mlp": 0.01265215, + "balance_loss_clip": 1.14812982, + "balance_loss_mlp": 1.04586983, + "epoch": 0.466676687208778, + "flos": 21654274933440.0, + "grad_norm": 1.6436868911840516, + "language_loss": 0.78847539, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.8157258, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.921980857849121 + }, + { + "auxiliary_loss_clip": 0.01499152, + "auxiliary_loss_mlp": 0.01215691, + "balance_loss_clip": 1.21728098, + "balance_loss_mlp": 1.01351166, + "epoch": 0.46673681046144594, + "flos": 68540927261760.0, + "grad_norm": 0.7916519901296378, + "language_loss": 0.59756476, + "learning_rate": 2.311272461028297e-06, + "loss": 0.62471318, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 3.4834656715393066 + }, + { + "auxiliary_loss_clip": 0.01455011, + "auxiliary_loss_mlp": 0.01255989, + "balance_loss_clip": 1.14216006, + "balance_loss_mlp": 1.03569078, + "epoch": 0.46679693371411396, + "flos": 15816238398720.0, + "grad_norm": 1.8635231549129765, + "language_loss": 0.78946757, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.81657767, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 2.8427205085754395 + }, + { + "auxiliary_loss_clip": 0.01449332, + "auxiliary_loss_mlp": 0.0124672, + "balance_loss_clip": 1.13918447, + "balance_loss_mlp": 1.03157091, + "epoch": 0.4668570569667819, + "flos": 18516522781440.0, + "grad_norm": 1.8925641414603795, + "language_loss": 0.72122014, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74818069, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.9336135387420654 + }, + { + "auxiliary_loss_clip": 0.01448806, + "auxiliary_loss_mlp": 0.01255994, + "balance_loss_clip": 1.13765311, + "balance_loss_mlp": 1.03798449, + "epoch": 0.4669171802194499, + "flos": 19208566910880.0, + "grad_norm": 2.091583248410818, + "language_loss": 0.78191721, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.80896521, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 2.8520140647888184 + }, + { + "auxiliary_loss_clip": 0.01447123, + "auxiliary_loss_mlp": 0.01240866, + "balance_loss_clip": 1.13636649, + "balance_loss_mlp": 1.02171135, + "epoch": 0.46697730347211786, + "flos": 12277771228800.0, + "grad_norm": 2.4959743187035857, + "language_loss": 0.65281677, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67969668, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 2.8507707118988037 + }, + { + "auxiliary_loss_clip": 0.0145403, + "auxiliary_loss_mlp": 0.01239424, + "balance_loss_clip": 1.14375305, + "balance_loss_mlp": 1.01893461, + "epoch": 0.4670374267247858, + "flos": 23589009684000.0, + "grad_norm": 2.1851413661066674, + "language_loss": 0.7470144, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.77394903, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 3.060908555984497 + }, + { + "auxiliary_loss_clip": 0.01449519, + "auxiliary_loss_mlp": 0.01249485, + "balance_loss_clip": 1.13838947, + "balance_loss_mlp": 1.03071213, + "epoch": 0.4670975499774538, + "flos": 15992568236160.0, + "grad_norm": 1.7005162072524986, + "language_loss": 0.70647246, + "learning_rate": 2.308963953858982e-06, + "loss": 0.73346245, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.859041690826416 + }, + { + "auxiliary_loss_clip": 0.01446441, + "auxiliary_loss_mlp": 0.0125513, + "balance_loss_clip": 1.13508451, + "balance_loss_mlp": 1.03731036, + "epoch": 0.46715767323012175, + "flos": 15379415408160.0, + "grad_norm": 1.7950147404510939, + "language_loss": 0.81262505, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83964074, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.908907651901245 + }, + { + "auxiliary_loss_clip": 0.01500516, + "auxiliary_loss_mlp": 0.01219948, + "balance_loss_clip": 1.21739829, + "balance_loss_mlp": 1.01700592, + "epoch": 0.4672177964827897, + "flos": 60258434333760.0, + "grad_norm": 0.803625108056871, + "language_loss": 0.55594283, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.58314741, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.393845796585083 + }, + { + "auxiliary_loss_clip": 0.01452782, + "auxiliary_loss_mlp": 0.01247025, + "balance_loss_clip": 1.1426909, + "balance_loss_mlp": 1.03149509, + "epoch": 0.4672779197354577, + "flos": 27638601838560.0, + "grad_norm": 3.303588076191888, + "language_loss": 0.6601094, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.68710744, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.9613969326019287 + }, + { + "auxiliary_loss_clip": 0.0144574, + "auxiliary_loss_mlp": 0.01254017, + "balance_loss_clip": 1.13488972, + "balance_loss_mlp": 1.03772354, + "epoch": 0.46733804298812565, + "flos": 31396940737920.0, + "grad_norm": 1.8429132305474352, + "language_loss": 0.63388026, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.66087782, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.960338592529297 + }, + { + "auxiliary_loss_clip": 0.01449691, + "auxiliary_loss_mlp": 0.01246602, + "balance_loss_clip": 1.13822055, + "balance_loss_mlp": 1.02840161, + "epoch": 0.4673981662407936, + "flos": 19502664778080.0, + "grad_norm": 2.4687889232233884, + "language_loss": 0.80311286, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.8300758, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.7048323154449463 + }, + { + "auxiliary_loss_clip": 0.01450146, + "auxiliary_loss_mlp": 0.01241632, + "balance_loss_clip": 1.13933659, + "balance_loss_mlp": 1.02438545, + "epoch": 0.4674582894934616, + "flos": 20523928615200.0, + "grad_norm": 1.724831009100156, + "language_loss": 0.77658176, + "learning_rate": 2.306655024915726e-06, + "loss": 0.80349952, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 2.7786755561828613 + }, + { + "auxiliary_loss_clip": 0.01448606, + "auxiliary_loss_mlp": 0.01241637, + "balance_loss_clip": 1.13786745, + "balance_loss_mlp": 1.02381802, + "epoch": 0.46751841274612954, + "flos": 22093146044640.0, + "grad_norm": 2.151630508322119, + "language_loss": 0.70047045, + "learning_rate": 2.306270162640694e-06, + "loss": 0.72737294, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 2.7754931449890137 + }, + { + "auxiliary_loss_clip": 0.01452355, + "auxiliary_loss_mlp": 0.01245814, + "balance_loss_clip": 1.14093065, + "balance_loss_mlp": 1.02913892, + "epoch": 0.46757853599879756, + "flos": 26982551897280.0, + "grad_norm": 1.4330743917911024, + "language_loss": 0.74124181, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.7682234, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.8074989318847656 + }, + { + "auxiliary_loss_clip": 0.01453595, + "auxiliary_loss_mlp": 0.0125311, + "balance_loss_clip": 1.14310491, + "balance_loss_mlp": 1.03471899, + "epoch": 0.4676386592514655, + "flos": 24136356425760.0, + "grad_norm": 2.1625631901122886, + "language_loss": 0.69900936, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.72607636, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 2.8018617630004883 + }, + { + "auxiliary_loss_clip": 0.0144906, + "auxiliary_loss_mlp": 0.01249474, + "balance_loss_clip": 1.13830447, + "balance_loss_mlp": 1.03051031, + "epoch": 0.4676987825041335, + "flos": 25486081407360.0, + "grad_norm": 2.190630571174162, + "language_loss": 0.73623788, + "learning_rate": 2.305115506191206e-06, + "loss": 0.76322317, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.9125871658325195 + }, + { + "auxiliary_loss_clip": 0.01449951, + "auxiliary_loss_mlp": 0.01238019, + "balance_loss_clip": 1.13997579, + "balance_loss_mlp": 1.01962733, + "epoch": 0.46775890575680146, + "flos": 21947500452960.0, + "grad_norm": 1.966671946449183, + "language_loss": 0.72458017, + "learning_rate": 2.304730597548562e-06, + "loss": 0.75145984, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 2.8301072120666504 + }, + { + "auxiliary_loss_clip": 0.01453676, + "auxiliary_loss_mlp": 0.01257877, + "balance_loss_clip": 1.14219332, + "balance_loss_mlp": 1.03548002, + "epoch": 0.4678190290094694, + "flos": 25230746484000.0, + "grad_norm": 1.7537574272977585, + "language_loss": 0.73806334, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76517892, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 2.743647336959839 + }, + { + "auxiliary_loss_clip": 0.01451652, + "auxiliary_loss_mlp": 0.01244752, + "balance_loss_clip": 1.14005005, + "balance_loss_mlp": 1.02636075, + "epoch": 0.4678791522621374, + "flos": 32271003928800.0, + "grad_norm": 1.7188421559969262, + "language_loss": 0.62883538, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65579939, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.8788037300109863 + }, + { + "auxiliary_loss_clip": 0.01455106, + "auxiliary_loss_mlp": 0.01247759, + "balance_loss_clip": 1.14313698, + "balance_loss_mlp": 1.02707911, + "epoch": 0.46793927551480535, + "flos": 27048585552480.0, + "grad_norm": 1.879825570167163, + "language_loss": 0.63024271, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65727139, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 4.503692388534546 + }, + { + "auxiliary_loss_clip": 0.01457577, + "auxiliary_loss_mlp": 0.01251214, + "balance_loss_clip": 1.14507627, + "balance_loss_mlp": 1.02996135, + "epoch": 0.4679993987674733, + "flos": 17459340612480.0, + "grad_norm": 3.1490884473369256, + "language_loss": 0.68275487, + "learning_rate": 2.303190847569801e-06, + "loss": 0.7098428, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 2.7038562297821045 + }, + { + "auxiliary_loss_clip": 0.01450949, + "auxiliary_loss_mlp": 0.01243259, + "balance_loss_clip": 1.1414144, + "balance_loss_mlp": 1.02677488, + "epoch": 0.4680595220201413, + "flos": 17167025368800.0, + "grad_norm": 2.1344486496257575, + "language_loss": 0.84555519, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.8724972, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 2.7360761165618896 + }, + { + "auxiliary_loss_clip": 0.01449526, + "auxiliary_loss_mlp": 0.01241947, + "balance_loss_clip": 1.13883758, + "balance_loss_mlp": 1.0248909, + "epoch": 0.46811964527280925, + "flos": 11329178474880.0, + "grad_norm": 4.045637743971119, + "language_loss": 0.77268457, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79959929, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.6992032527923584 + }, + { + "auxiliary_loss_clip": 0.0144846, + "auxiliary_loss_mlp": 0.01240352, + "balance_loss_clip": 1.13737094, + "balance_loss_mlp": 1.02444041, + "epoch": 0.4681797685254772, + "flos": 24281053813440.0, + "grad_norm": 2.250417446959275, + "language_loss": 0.74419057, + "learning_rate": 2.302035914315856e-06, + "loss": 0.77107871, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.811680316925049 + }, + { + "auxiliary_loss_clip": 0.01452305, + "auxiliary_loss_mlp": 0.01246462, + "balance_loss_clip": 1.14180195, + "balance_loss_mlp": 1.03074098, + "epoch": 0.4682398917781452, + "flos": 31653110080800.0, + "grad_norm": 1.7385136721820331, + "language_loss": 0.65631652, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.68330425, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 2.833041191101074 + }, + { + "auxiliary_loss_clip": 0.0145031, + "auxiliary_loss_mlp": 0.01244571, + "balance_loss_clip": 1.14098358, + "balance_loss_mlp": 1.0278964, + "epoch": 0.46830001503081314, + "flos": 28113277497120.0, + "grad_norm": 1.9728539348008614, + "language_loss": 0.64032233, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66727114, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.8148353099823 + }, + { + "auxiliary_loss_clip": 0.0149573, + "auxiliary_loss_mlp": 0.0119561, + "balance_loss_clip": 1.21385884, + "balance_loss_mlp": 0.99190521, + "epoch": 0.4683601382834811, + "flos": 57887862724800.0, + "grad_norm": 0.7383287788308762, + "language_loss": 0.61802197, + "learning_rate": 2.300880877982825e-06, + "loss": 0.64493537, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 4.898469924926758 + }, + { + "auxiliary_loss_clip": 0.01453154, + "auxiliary_loss_mlp": 0.01247917, + "balance_loss_clip": 1.14359069, + "balance_loss_mlp": 1.03314936, + "epoch": 0.46842026153614913, + "flos": 21874108734720.0, + "grad_norm": 4.04906623510211, + "language_loss": 0.7927717, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81978238, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 4.276557207107544 + }, + { + "auxiliary_loss_clip": 0.01451344, + "auxiliary_loss_mlp": 0.0123821, + "balance_loss_clip": 1.13932657, + "balance_loss_mlp": 1.01981807, + "epoch": 0.4684803847888171, + "flos": 24903195615360.0, + "grad_norm": 1.5314408961221186, + "language_loss": 0.74971259, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77660811, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 2.803750514984131 + }, + { + "auxiliary_loss_clip": 0.01444048, + "auxiliary_loss_mlp": 0.01238883, + "balance_loss_clip": 1.13275003, + "balance_loss_mlp": 1.02373433, + "epoch": 0.46854050804148506, + "flos": 26254172226240.0, + "grad_norm": 1.5363441813285363, + "language_loss": 0.68190312, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70873237, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 2.896150827407837 + }, + { + "auxiliary_loss_clip": 0.01449257, + "auxiliary_loss_mlp": 0.01239369, + "balance_loss_clip": 1.13793457, + "balance_loss_mlp": 1.02383804, + "epoch": 0.468600631294153, + "flos": 21581945203680.0, + "grad_norm": 2.19462579647104, + "language_loss": 0.74004185, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76692814, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 2.7872848510742188 + }, + { + "auxiliary_loss_clip": 0.01448946, + "auxiliary_loss_mlp": 0.0125598, + "balance_loss_clip": 1.13980484, + "balance_loss_mlp": 1.03777933, + "epoch": 0.468660754546821, + "flos": 25888048054560.0, + "grad_norm": 1.5671148339121617, + "language_loss": 0.63582653, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.66287577, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 2.919768810272217 + }, + { + "auxiliary_loss_clip": 0.01451397, + "auxiliary_loss_mlp": 0.01247753, + "balance_loss_clip": 1.14053762, + "balance_loss_mlp": 1.03279448, + "epoch": 0.46872087779948896, + "flos": 35477596419840.0, + "grad_norm": 5.606728230840502, + "language_loss": 0.68297511, + "learning_rate": 2.298570497656304e-06, + "loss": 0.7099666, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.907317638397217 + }, + { + "auxiliary_loss_clip": 0.01448342, + "auxiliary_loss_mlp": 0.0125119, + "balance_loss_clip": 1.1367346, + "balance_loss_mlp": 1.0368042, + "epoch": 0.4687810010521569, + "flos": 26398869613920.0, + "grad_norm": 2.1131551642629662, + "language_loss": 0.70379579, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.73079109, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 4.351669549942017 + }, + { + "auxiliary_loss_clip": 0.01451608, + "auxiliary_loss_mlp": 0.01252786, + "balance_loss_clip": 1.1395936, + "balance_loss_mlp": 1.03515697, + "epoch": 0.4688411243048249, + "flos": 19974723393600.0, + "grad_norm": 2.2807912256573157, + "language_loss": 0.67455673, + "learning_rate": 2.297800280150454e-06, + "loss": 0.70160067, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 2.7269468307495117 + }, + { + "auxiliary_loss_clip": 0.01494579, + "auxiliary_loss_mlp": 0.01237938, + "balance_loss_clip": 1.21154284, + "balance_loss_mlp": 1.03804779, + "epoch": 0.46890124755749285, + "flos": 63983623656960.0, + "grad_norm": 0.9325837263979864, + "language_loss": 0.64423925, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.6715644, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 3.4743812084198 + }, + { + "auxiliary_loss_clip": 0.01445085, + "auxiliary_loss_mlp": 0.01237403, + "balance_loss_clip": 1.13398814, + "balance_loss_mlp": 1.0226357, + "epoch": 0.4689613708101608, + "flos": 23771256314400.0, + "grad_norm": 1.3102681017618825, + "language_loss": 0.72192919, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74875408, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 2.7910168170928955 + }, + { + "auxiliary_loss_clip": 0.01443194, + "auxiliary_loss_mlp": 0.01249918, + "balance_loss_clip": 1.13263988, + "balance_loss_mlp": 1.0353415, + "epoch": 0.4690214940628288, + "flos": 24790927168800.0, + "grad_norm": 1.8017582278525202, + "language_loss": 0.72442526, + "learning_rate": 2.296644869233568e-06, + "loss": 0.75135636, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.8162200450897217 + }, + { + "auxiliary_loss_clip": 0.01450788, + "auxiliary_loss_mlp": 0.0125182, + "balance_loss_clip": 1.13871479, + "balance_loss_mlp": 1.03323817, + "epoch": 0.46908161731549675, + "flos": 18079282581120.0, + "grad_norm": 2.2379718577656473, + "language_loss": 0.62702566, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.65405178, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.89460825920105 + }, + { + "auxiliary_loss_clip": 0.01448887, + "auxiliary_loss_mlp": 0.01258911, + "balance_loss_clip": 1.13652039, + "balance_loss_mlp": 1.04319048, + "epoch": 0.4691417405681647, + "flos": 25705763496000.0, + "grad_norm": 9.112478838688412, + "language_loss": 0.73839682, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.76547486, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 2.8381896018981934 + }, + { + "auxiliary_loss_clip": 0.01446842, + "auxiliary_loss_mlp": 0.01241571, + "balance_loss_clip": 1.13427043, + "balance_loss_mlp": 1.02661252, + "epoch": 0.46920186382083273, + "flos": 17458961330880.0, + "grad_norm": 1.6758593453066764, + "language_loss": 0.77279949, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79968369, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.9072892665863037 + }, + { + "auxiliary_loss_clip": 0.01445733, + "auxiliary_loss_mlp": 0.01253066, + "balance_loss_clip": 1.13344586, + "balance_loss_mlp": 1.03868032, + "epoch": 0.4692619870735007, + "flos": 20341530272160.0, + "grad_norm": 1.6904039508895878, + "language_loss": 0.77484381, + "learning_rate": 2.295104163929305e-06, + "loss": 0.80183178, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 2.7518298625946045 + }, + { + "auxiliary_loss_clip": 0.01452693, + "auxiliary_loss_mlp": 0.01249385, + "balance_loss_clip": 1.13811481, + "balance_loss_mlp": 1.02851379, + "epoch": 0.46932211032616866, + "flos": 29499110451360.0, + "grad_norm": 1.6628029798607797, + "language_loss": 0.83170211, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85872293, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.8740148544311523 + }, + { + "auxiliary_loss_clip": 0.0144512, + "auxiliary_loss_mlp": 0.01259442, + "balance_loss_clip": 1.13223457, + "balance_loss_mlp": 1.0460093, + "epoch": 0.4693822335788366, + "flos": 36214244429760.0, + "grad_norm": 1.7616042436128665, + "language_loss": 0.77270573, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79975128, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.870454788208008 + }, + { + "auxiliary_loss_clip": 0.01453346, + "auxiliary_loss_mlp": 0.01257908, + "balance_loss_clip": 1.14057255, + "balance_loss_mlp": 1.04123306, + "epoch": 0.4694423568315046, + "flos": 20341037206080.0, + "grad_norm": 2.265928761944394, + "language_loss": 0.51883125, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.5459438, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.794706344604492 + }, + { + "auxiliary_loss_clip": 0.0150035, + "auxiliary_loss_mlp": 0.01200768, + "balance_loss_clip": 1.21525431, + "balance_loss_mlp": 0.99858856, + "epoch": 0.46950248008417256, + "flos": 64332111234240.0, + "grad_norm": 0.7805973556036105, + "language_loss": 0.57707632, + "learning_rate": 2.293563279578978e-06, + "loss": 0.60408747, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 3.245617151260376 + }, + { + "auxiliary_loss_clip": 0.01454391, + "auxiliary_loss_mlp": 0.01257777, + "balance_loss_clip": 1.14013708, + "balance_loss_mlp": 1.04167449, + "epoch": 0.4695626033368405, + "flos": 19201663985760.0, + "grad_norm": 1.9710029903902657, + "language_loss": 0.71464467, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.74176639, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.7873377799987793 + }, + { + "auxiliary_loss_clip": 0.01451086, + "auxiliary_loss_mlp": 0.01250877, + "balance_loss_clip": 1.13773465, + "balance_loss_mlp": 1.03134155, + "epoch": 0.4696227265895085, + "flos": 23004644693760.0, + "grad_norm": 1.8803123662141075, + "language_loss": 0.81173515, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83875477, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.8514177799224854 + }, + { + "auxiliary_loss_clip": 0.01452268, + "auxiliary_loss_mlp": 0.01255479, + "balance_loss_clip": 1.13889682, + "balance_loss_mlp": 1.03727794, + "epoch": 0.46968284984217645, + "flos": 21872970889920.0, + "grad_norm": 1.6316520484567538, + "language_loss": 0.80967236, + "learning_rate": 2.292407499379644e-06, + "loss": 0.83674985, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.7704288959503174 + }, + { + "auxiliary_loss_clip": 0.01446618, + "auxiliary_loss_mlp": 0.01249072, + "balance_loss_clip": 1.13365579, + "balance_loss_mlp": 1.03411365, + "epoch": 0.4697429730948444, + "flos": 19977074939520.0, + "grad_norm": 1.7185504141632908, + "language_loss": 0.7431581, + "learning_rate": 2.292022217117477e-06, + "loss": 0.77011502, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.8316125869750977 + }, + { + "auxiliary_loss_clip": 0.01443058, + "auxiliary_loss_mlp": 0.01256607, + "balance_loss_clip": 1.12934959, + "balance_loss_mlp": 1.04088593, + "epoch": 0.4698030963475124, + "flos": 15157874839680.0, + "grad_norm": 2.3163884276066233, + "language_loss": 0.84565103, + "learning_rate": 2.291636923781798e-06, + "loss": 0.87264764, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.765237808227539 + }, + { + "auxiliary_loss_clip": 0.01446428, + "auxiliary_loss_mlp": 0.01245475, + "balance_loss_clip": 1.13328838, + "balance_loss_mlp": 1.0307076, + "epoch": 0.46986321960018035, + "flos": 15152564897280.0, + "grad_norm": 2.2489751693400026, + "language_loss": 0.81504267, + "learning_rate": 2.291251619387217e-06, + "loss": 0.84196168, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 2.7712833881378174 + }, + { + "auxiliary_loss_clip": 0.01448503, + "auxiliary_loss_mlp": 0.01245054, + "balance_loss_clip": 1.13454938, + "balance_loss_mlp": 1.02971423, + "epoch": 0.4699233428528483, + "flos": 23110958419200.0, + "grad_norm": 2.1893924378079395, + "language_loss": 0.77933621, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.80627179, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.830904722213745 + }, + { + "auxiliary_loss_clip": 0.01496361, + "auxiliary_loss_mlp": 0.01201263, + "balance_loss_clip": 1.2101841, + "balance_loss_mlp": 1.00061035, + "epoch": 0.46998346610551633, + "flos": 68112713963520.0, + "grad_norm": 0.8427107630546983, + "language_loss": 0.5896219, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61659819, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.3562331199645996 + }, + { + "auxiliary_loss_clip": 0.01455733, + "auxiliary_loss_mlp": 0.01247116, + "balance_loss_clip": 1.14354825, + "balance_loss_mlp": 1.03044128, + "epoch": 0.4700435893581843, + "flos": 24131501621280.0, + "grad_norm": 1.7466682419316355, + "language_loss": 0.7930882, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.82011664, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 2.857898235321045 + }, + { + "auxiliary_loss_clip": 0.01453663, + "auxiliary_loss_mlp": 0.01237582, + "balance_loss_clip": 1.14023066, + "balance_loss_mlp": 1.02205205, + "epoch": 0.47010371261085226, + "flos": 20152229004000.0, + "grad_norm": 2.0867090183510997, + "language_loss": 0.83823073, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86514318, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.759115695953369 + }, + { + "auxiliary_loss_clip": 0.01452955, + "auxiliary_loss_mlp": 0.01248064, + "balance_loss_clip": 1.14000368, + "balance_loss_mlp": 1.02852821, + "epoch": 0.47016383586352023, + "flos": 15124004628480.0, + "grad_norm": 2.118309079441479, + "language_loss": 0.76473808, + "learning_rate": 2.289324932042186e-06, + "loss": 0.79174823, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 2.793083667755127 + }, + { + "auxiliary_loss_clip": 0.0145223, + "auxiliary_loss_mlp": 0.01253171, + "balance_loss_clip": 1.14013958, + "balance_loss_mlp": 1.0359242, + "epoch": 0.4702239591161882, + "flos": 13554673414560.0, + "grad_norm": 1.9311578761583723, + "language_loss": 0.74582028, + "learning_rate": 2.288939561601039e-06, + "loss": 0.77287436, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 4.442604303359985 + }, + { + "auxiliary_loss_clip": 0.01446752, + "auxiliary_loss_mlp": 0.01242804, + "balance_loss_clip": 1.13439, + "balance_loss_mlp": 1.02479434, + "epoch": 0.47028408236885616, + "flos": 24278626411200.0, + "grad_norm": 1.7348547618358257, + "language_loss": 0.8920657, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91896129, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 2.783780574798584 + }, + { + "auxiliary_loss_clip": 0.0145139, + "auxiliary_loss_mlp": 0.01244438, + "balance_loss_clip": 1.13791955, + "balance_loss_mlp": 1.02776337, + "epoch": 0.4703442056215241, + "flos": 22859150814720.0, + "grad_norm": 1.5617563061183972, + "language_loss": 0.79985285, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.82681119, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 2.779024124145508 + }, + { + "auxiliary_loss_clip": 0.01495436, + "auxiliary_loss_mlp": 0.01211265, + "balance_loss_clip": 1.20472801, + "balance_loss_mlp": 1.00832367, + "epoch": 0.4704043288741921, + "flos": 69247649589120.0, + "grad_norm": 0.7166624628905143, + "language_loss": 0.56611359, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.59318066, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.4271748065948486 + }, + { + "auxiliary_loss_clip": 0.01452759, + "auxiliary_loss_mlp": 0.0124701, + "balance_loss_clip": 1.13922668, + "balance_loss_mlp": 1.02728391, + "epoch": 0.47046445212686006, + "flos": 18043060824000.0, + "grad_norm": 1.8625056309143737, + "language_loss": 0.81118751, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83818519, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 2.7546634674072266 + }, + { + "auxiliary_loss_clip": 0.01452124, + "auxiliary_loss_mlp": 0.0125064, + "balance_loss_clip": 1.13832235, + "balance_loss_mlp": 1.03415644, + "epoch": 0.470524575379528, + "flos": 23953730513760.0, + "grad_norm": 2.1146260115462727, + "language_loss": 0.66573024, + "learning_rate": 2.287012545338324e-06, + "loss": 0.6927579, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.840786933898926 + }, + { + "auxiliary_loss_clip": 0.01444698, + "auxiliary_loss_mlp": 0.01249144, + "balance_loss_clip": 1.13002849, + "balance_loss_mlp": 1.03304148, + "epoch": 0.470584698632196, + "flos": 18115504338240.0, + "grad_norm": 1.7008419607877552, + "language_loss": 0.83934832, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86628675, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 2.7250237464904785 + }, + { + "auxiliary_loss_clip": 0.01491498, + "auxiliary_loss_mlp": 0.01212463, + "balance_loss_clip": 1.20111144, + "balance_loss_mlp": 1.01257324, + "epoch": 0.47064482188486395, + "flos": 57257490512160.0, + "grad_norm": 0.7988535022161865, + "language_loss": 0.55652916, + "learning_rate": 2.286241662546122e-06, + "loss": 0.58356881, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 4.802292108535767 + }, + { + "auxiliary_loss_clip": 0.014451, + "auxiliary_loss_mlp": 0.01249913, + "balance_loss_clip": 1.13294828, + "balance_loss_mlp": 1.03342903, + "epoch": 0.4707049451375319, + "flos": 17897035950720.0, + "grad_norm": 1.887180454795088, + "language_loss": 0.81089848, + "learning_rate": 2.285856204861245e-06, + "loss": 0.83784866, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 4.262857437133789 + }, + { + "auxiliary_loss_clip": 0.01451943, + "auxiliary_loss_mlp": 0.01250271, + "balance_loss_clip": 1.13805521, + "balance_loss_mlp": 1.03550339, + "epoch": 0.47076506839019994, + "flos": 25235259935040.0, + "grad_norm": 1.3131062242307674, + "language_loss": 0.75800663, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78502882, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 2.8930625915527344 + }, + { + "auxiliary_loss_clip": 0.01444606, + "auxiliary_loss_mlp": 0.01255622, + "balance_loss_clip": 1.13113248, + "balance_loss_mlp": 1.04028249, + "epoch": 0.4708251916428679, + "flos": 13481357552640.0, + "grad_norm": 2.1915994916014006, + "language_loss": 0.78292274, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.80992496, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 2.7226221561431885 + }, + { + "auxiliary_loss_clip": 0.01440606, + "auxiliary_loss_mlp": 0.01250769, + "balance_loss_clip": 1.12637639, + "balance_loss_mlp": 1.03104281, + "epoch": 0.47088531489553587, + "flos": 30150115947360.0, + "grad_norm": 2.5031959948230793, + "language_loss": 0.75614941, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.78306317, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 2.8552322387695312 + }, + { + "auxiliary_loss_clip": 0.01443462, + "auxiliary_loss_mlp": 0.01252034, + "balance_loss_clip": 1.13178229, + "balance_loss_mlp": 1.0374577, + "epoch": 0.47094543814820383, + "flos": 21800413591200.0, + "grad_norm": 1.2978359868816243, + "language_loss": 0.74845421, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.77540916, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.7642178535461426 + }, + { + "auxiliary_loss_clip": 0.01438343, + "auxiliary_loss_mlp": 0.01251407, + "balance_loss_clip": 1.12634301, + "balance_loss_mlp": 1.03606784, + "epoch": 0.4710055614008718, + "flos": 23005251544320.0, + "grad_norm": 1.59723868793269, + "language_loss": 0.75951529, + "learning_rate": 2.283928754133762e-06, + "loss": 0.78641284, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 2.7966363430023193 + }, + { + "auxiliary_loss_clip": 0.01447227, + "auxiliary_loss_mlp": 0.01251207, + "balance_loss_clip": 1.13598156, + "balance_loss_mlp": 1.0343411, + "epoch": 0.47106568465353976, + "flos": 42744666447360.0, + "grad_norm": 1.3865565840701883, + "language_loss": 0.66268873, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68967301, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 2.95955753326416 + }, + { + "auxiliary_loss_clip": 0.01489844, + "auxiliary_loss_mlp": 0.01218109, + "balance_loss_clip": 1.20131516, + "balance_loss_mlp": 1.01745605, + "epoch": 0.4711258079062077, + "flos": 68559739993440.0, + "grad_norm": 0.8658512682534553, + "language_loss": 0.62137258, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64845216, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 4.8497021198272705 + }, + { + "auxiliary_loss_clip": 0.01438162, + "auxiliary_loss_mlp": 0.01256797, + "balance_loss_clip": 1.12537932, + "balance_loss_mlp": 1.03993154, + "epoch": 0.4711859311588757, + "flos": 25448911446240.0, + "grad_norm": 1.9622696718585677, + "language_loss": 0.69502759, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.72197711, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 2.8058698177337646 + }, + { + "auxiliary_loss_clip": 0.01444693, + "auxiliary_loss_mlp": 0.01248273, + "balance_loss_clip": 1.13046396, + "balance_loss_mlp": 1.032933, + "epoch": 0.47124605441154366, + "flos": 21984101491680.0, + "grad_norm": 1.7946504266202623, + "language_loss": 0.66477847, + "learning_rate": 2.282386599665153e-06, + "loss": 0.69170815, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 2.7704408168792725 + }, + { + "auxiliary_loss_clip": 0.01439078, + "auxiliary_loss_mlp": 0.01254961, + "balance_loss_clip": 1.12571883, + "balance_loss_mlp": 1.03599763, + "epoch": 0.4713061776642116, + "flos": 25415155019520.0, + "grad_norm": 2.532486906375829, + "language_loss": 0.77901137, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.80595171, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 2.7959706783294678 + }, + { + "auxiliary_loss_clip": 0.01435347, + "auxiliary_loss_mlp": 0.012495, + "balance_loss_clip": 1.12094879, + "balance_loss_mlp": 1.03492332, + "epoch": 0.4713663009168796, + "flos": 26544704846400.0, + "grad_norm": 1.9431297550433253, + "language_loss": 0.73166811, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75851661, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.856720209121704 + }, + { + "auxiliary_loss_clip": 0.01430104, + "auxiliary_loss_mlp": 0.01251503, + "balance_loss_clip": 1.11711693, + "balance_loss_mlp": 1.03635454, + "epoch": 0.47142642416954755, + "flos": 23625876219840.0, + "grad_norm": 1.5618148564723233, + "language_loss": 0.75041461, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77723074, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 2.8253591060638428 + }, + { + "auxiliary_loss_clip": 0.01428175, + "auxiliary_loss_mlp": 0.01245337, + "balance_loss_clip": 1.11498094, + "balance_loss_mlp": 1.02961576, + "epoch": 0.4714865474222155, + "flos": 22312372995360.0, + "grad_norm": 2.0444896145816847, + "language_loss": 0.7056973, + "learning_rate": 2.280844273866501e-06, + "loss": 0.73243237, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.8293559551239014 + }, + { + "auxiliary_loss_clip": 0.01436841, + "auxiliary_loss_mlp": 0.01251447, + "balance_loss_clip": 1.12315452, + "balance_loss_mlp": 1.03687072, + "epoch": 0.4715466706748835, + "flos": 17824402795680.0, + "grad_norm": 2.133127575445659, + "language_loss": 0.79248232, + "learning_rate": 2.280458665756177e-06, + "loss": 0.8193652, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 2.8340563774108887 + }, + { + "auxiliary_loss_clip": 0.01440355, + "auxiliary_loss_mlp": 0.01242625, + "balance_loss_clip": 1.12424791, + "balance_loss_mlp": 1.02575922, + "epoch": 0.4716067939275515, + "flos": 23661794551680.0, + "grad_norm": 1.62176850749619, + "language_loss": 0.7425245, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76935434, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.8085498809814453 + }, + { + "auxiliary_loss_clip": 0.01436584, + "auxiliary_loss_mlp": 0.01246061, + "balance_loss_clip": 1.12288022, + "balance_loss_mlp": 1.0295769, + "epoch": 0.47166691718021947, + "flos": 17932044006720.0, + "grad_norm": 1.6640784318919057, + "language_loss": 0.78341347, + "learning_rate": 2.279687417645088e-06, + "loss": 0.81023991, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 2.7479677200317383 + }, + { + "auxiliary_loss_clip": 0.01435232, + "auxiliary_loss_mlp": 0.01242418, + "balance_loss_clip": 1.12273192, + "balance_loss_mlp": 1.02784121, + "epoch": 0.47172704043288743, + "flos": 26616996648000.0, + "grad_norm": 3.8664575922460935, + "language_loss": 0.73127037, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75804687, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.805109739303589 + }, + { + "auxiliary_loss_clip": 0.01439085, + "auxiliary_loss_mlp": 0.01245415, + "balance_loss_clip": 1.12665689, + "balance_loss_mlp": 1.0296936, + "epoch": 0.4717871636855554, + "flos": 27924886504800.0, + "grad_norm": 1.4117324905862065, + "language_loss": 0.74307042, + "learning_rate": 2.2789161271109e-06, + "loss": 0.7699154, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.8741018772125244 + }, + { + "auxiliary_loss_clip": 0.01439154, + "auxiliary_loss_mlp": 0.01239835, + "balance_loss_clip": 1.12670028, + "balance_loss_mlp": 1.02583003, + "epoch": 0.47184728693822336, + "flos": 14504024731680.0, + "grad_norm": 1.9162751691446605, + "language_loss": 0.81210852, + "learning_rate": 2.278530465971703e-06, + "loss": 0.83889836, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.7318456172943115 + }, + { + "auxiliary_loss_clip": 0.0144823, + "auxiliary_loss_mlp": 0.01259285, + "balance_loss_clip": 1.13628721, + "balance_loss_mlp": 1.04184747, + "epoch": 0.47190741019089133, + "flos": 17858386791360.0, + "grad_norm": 2.0345847955192418, + "language_loss": 0.70546252, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.73253763, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.7070260047912598 + }, + { + "auxiliary_loss_clip": 0.0143751, + "auxiliary_loss_mlp": 0.01258265, + "balance_loss_clip": 1.12395382, + "balance_loss_mlp": 1.03930163, + "epoch": 0.4719675334435593, + "flos": 17897453160480.0, + "grad_norm": 2.483673126322484, + "language_loss": 0.69484353, + "learning_rate": 2.277759112022224e-06, + "loss": 0.72180128, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.722595453262329 + }, + { + "auxiliary_loss_clip": 0.01435705, + "auxiliary_loss_mlp": 0.01250491, + "balance_loss_clip": 1.1215086, + "balance_loss_mlp": 1.03381634, + "epoch": 0.47202765669622726, + "flos": 20706478670880.0, + "grad_norm": 1.788473211612147, + "language_loss": 0.75136089, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.77822286, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.764580249786377 + }, + { + "auxiliary_loss_clip": 0.01432561, + "auxiliary_loss_mlp": 0.01256691, + "balance_loss_clip": 1.11834884, + "balance_loss_mlp": 1.04116023, + "epoch": 0.4720877799488952, + "flos": 16361992157760.0, + "grad_norm": 1.7947705066376014, + "language_loss": 0.76547611, + "learning_rate": 2.276987715942132e-06, + "loss": 0.79236865, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.7283987998962402 + }, + { + "auxiliary_loss_clip": 0.01435749, + "auxiliary_loss_mlp": 0.01247774, + "balance_loss_clip": 1.122563, + "balance_loss_mlp": 1.03129053, + "epoch": 0.4721479032015632, + "flos": 20670294841920.0, + "grad_norm": 1.6307829637288214, + "language_loss": 0.69258994, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.7194252, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.802957773208618 + }, + { + "auxiliary_loss_clip": 0.01499101, + "auxiliary_loss_mlp": 0.01213089, + "balance_loss_clip": 1.21287203, + "balance_loss_mlp": 1.01319885, + "epoch": 0.47220802645423116, + "flos": 67758119952480.0, + "grad_norm": 1.7129319282904707, + "language_loss": 0.50158429, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52870619, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 3.4734530448913574 + }, + { + "auxiliary_loss_clip": 0.01433321, + "auxiliary_loss_mlp": 0.01249554, + "balance_loss_clip": 1.11907911, + "balance_loss_mlp": 1.0319252, + "epoch": 0.4722681497068991, + "flos": 20923467860160.0, + "grad_norm": 1.7975653805072342, + "language_loss": 0.64350277, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.67033154, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 2.7765016555786133 + }, + { + "auxiliary_loss_clip": 0.01432673, + "auxiliary_loss_mlp": 0.01250468, + "balance_loss_clip": 1.11897242, + "balance_loss_mlp": 1.03379297, + "epoch": 0.4723282729595671, + "flos": 28295599983840.0, + "grad_norm": 1.9741224114804516, + "language_loss": 0.76274985, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.7895813, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.8636157512664795 + }, + { + "auxiliary_loss_clip": 0.01429683, + "auxiliary_loss_mlp": 0.01241382, + "balance_loss_clip": 1.11620986, + "balance_loss_mlp": 1.02527964, + "epoch": 0.4723883962122351, + "flos": 27127476853920.0, + "grad_norm": 1.7207266271217845, + "language_loss": 0.74919981, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.77591044, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 2.8204777240753174 + }, + { + "auxiliary_loss_clip": 0.01435662, + "auxiliary_loss_mlp": 0.0125473, + "balance_loss_clip": 1.12121129, + "balance_loss_mlp": 1.0393908, + "epoch": 0.47244851946490307, + "flos": 31539779645760.0, + "grad_norm": 1.5755279919903076, + "language_loss": 0.64791054, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.67481452, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.8914520740509033 + }, + { + "auxiliary_loss_clip": 0.01437653, + "auxiliary_loss_mlp": 0.01247548, + "balance_loss_clip": 1.1237359, + "balance_loss_mlp": 1.03220868, + "epoch": 0.47250864271757104, + "flos": 20888649444960.0, + "grad_norm": 1.5993639173979448, + "language_loss": 0.70231158, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72916359, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 4.756227254867554 + }, + { + "auxiliary_loss_clip": 0.01433487, + "auxiliary_loss_mlp": 0.0124262, + "balance_loss_clip": 1.12020278, + "balance_loss_mlp": 1.0240376, + "epoch": 0.472568765970239, + "flos": 20524194112320.0, + "grad_norm": 1.687390615392113, + "language_loss": 0.61940396, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64616507, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.7921605110168457 + }, + { + "auxiliary_loss_clip": 0.01436637, + "auxiliary_loss_mlp": 0.01254463, + "balance_loss_clip": 1.123456, + "balance_loss_mlp": 1.03721619, + "epoch": 0.47262888922290697, + "flos": 35807916044160.0, + "grad_norm": 2.2237282544778143, + "language_loss": 0.72472, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.75163102, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 2.8702590465545654 + }, + { + "auxiliary_loss_clip": 0.01438886, + "auxiliary_loss_mlp": 0.01235567, + "balance_loss_clip": 1.12418795, + "balance_loss_mlp": 1.01927423, + "epoch": 0.47268901247557493, + "flos": 20669839704000.0, + "grad_norm": 2.0204435371012264, + "language_loss": 0.85437906, + "learning_rate": 2.273130107677896e-06, + "loss": 0.88112354, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.8031845092773438 + }, + { + "auxiliary_loss_clip": 0.01436846, + "auxiliary_loss_mlp": 0.01240978, + "balance_loss_clip": 1.12378216, + "balance_loss_mlp": 1.02449381, + "epoch": 0.4727491357282429, + "flos": 19575335861280.0, + "grad_norm": 1.833768492461049, + "language_loss": 0.84633875, + "learning_rate": 2.272744289645927e-06, + "loss": 0.87311697, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 2.830202102661133 + }, + { + "auxiliary_loss_clip": 0.01441651, + "auxiliary_loss_mlp": 0.0124558, + "balance_loss_clip": 1.12870145, + "balance_loss_mlp": 1.02757001, + "epoch": 0.47280925898091086, + "flos": 18218859667200.0, + "grad_norm": 2.0386039546389503, + "language_loss": 0.65806079, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68493307, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.8081798553466797 + }, + { + "auxiliary_loss_clip": 0.01434102, + "auxiliary_loss_mlp": 0.01250361, + "balance_loss_clip": 1.12071276, + "balance_loss_mlp": 1.03082502, + "epoch": 0.4728693822335788, + "flos": 17823758016960.0, + "grad_norm": 2.0639308644352794, + "language_loss": 0.65074879, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67759347, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.7211151123046875 + }, + { + "auxiliary_loss_clip": 0.01429875, + "auxiliary_loss_mlp": 0.01244746, + "balance_loss_clip": 1.11657262, + "balance_loss_mlp": 1.02921605, + "epoch": 0.4729295054862468, + "flos": 20597168620800.0, + "grad_norm": 1.785376454118189, + "language_loss": 0.74146247, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76820868, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 4.280797719955444 + }, + { + "auxiliary_loss_clip": 0.01428246, + "auxiliary_loss_mlp": 0.01234346, + "balance_loss_clip": 1.11525273, + "balance_loss_mlp": 1.01690793, + "epoch": 0.47298962873891476, + "flos": 23370351655680.0, + "grad_norm": 1.7291263111077364, + "language_loss": 0.82872033, + "learning_rate": 2.271200914239451e-06, + "loss": 0.8553462, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 2.8003151416778564 + }, + { + "auxiliary_loss_clip": 0.01427831, + "auxiliary_loss_mlp": 0.01234731, + "balance_loss_clip": 1.1130172, + "balance_loss_mlp": 1.01881874, + "epoch": 0.4730497519915827, + "flos": 22054307244480.0, + "grad_norm": 1.719268669820948, + "language_loss": 0.79699862, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.82362425, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 4.346321105957031 + }, + { + "auxiliary_loss_clip": 0.01433453, + "auxiliary_loss_mlp": 0.01246382, + "balance_loss_clip": 1.12018299, + "balance_loss_mlp": 1.02646446, + "epoch": 0.4731098752442507, + "flos": 21071882207520.0, + "grad_norm": 1.9604635731272997, + "language_loss": 0.75108981, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.77788818, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 2.803053140640259 + }, + { + "auxiliary_loss_clip": 0.01435313, + "auxiliary_loss_mlp": 0.0123929, + "balance_loss_clip": 1.12105381, + "balance_loss_mlp": 1.01860964, + "epoch": 0.4731699984969187, + "flos": 22530993095520.0, + "grad_norm": 2.489947555212546, + "language_loss": 0.74024636, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76699233, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 2.8315317630767822 + }, + { + "auxiliary_loss_clip": 0.01437882, + "auxiliary_loss_mlp": 0.01253865, + "balance_loss_clip": 1.12384558, + "balance_loss_mlp": 1.03413892, + "epoch": 0.4732301217495867, + "flos": 24900351003360.0, + "grad_norm": 2.260805314783617, + "language_loss": 0.81377757, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.84069502, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 2.7847516536712646 + }, + { + "auxiliary_loss_clip": 0.0143113, + "auxiliary_loss_mlp": 0.01249735, + "balance_loss_clip": 1.11563182, + "balance_loss_mlp": 1.03286934, + "epoch": 0.47329024500225464, + "flos": 22786669372320.0, + "grad_norm": 1.566501728769851, + "language_loss": 0.75978529, + "learning_rate": 2.269271463701879e-06, + "loss": 0.78659391, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 2.836992025375366 + }, + { + "auxiliary_loss_clip": 0.01425075, + "auxiliary_loss_mlp": 0.01248689, + "balance_loss_clip": 1.11177588, + "balance_loss_mlp": 1.03220487, + "epoch": 0.4733503682549226, + "flos": 38699246390400.0, + "grad_norm": 1.7280726998015763, + "language_loss": 0.68164486, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70838249, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.997701406478882 + }, + { + "auxiliary_loss_clip": 0.01436841, + "auxiliary_loss_mlp": 0.01250161, + "balance_loss_clip": 1.12208819, + "balance_loss_mlp": 1.03272295, + "epoch": 0.47341049150759057, + "flos": 22969105643520.0, + "grad_norm": 1.5622695768802872, + "language_loss": 0.7276721, + "learning_rate": 2.26849961190881e-06, + "loss": 0.75454211, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 4.263060808181763 + }, + { + "auxiliary_loss_clip": 0.01436498, + "auxiliary_loss_mlp": 0.01262344, + "balance_loss_clip": 1.12233162, + "balance_loss_mlp": 1.04509735, + "epoch": 0.47347061476025853, + "flos": 14540246488800.0, + "grad_norm": 2.432463698274674, + "language_loss": 0.65287316, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67986161, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.8196160793304443 + }, + { + "auxiliary_loss_clip": 0.0143925, + "auxiliary_loss_mlp": 0.0124393, + "balance_loss_clip": 1.12368011, + "balance_loss_mlp": 1.02592039, + "epoch": 0.4735307380129265, + "flos": 30265873784640.0, + "grad_norm": 2.3058222560239545, + "language_loss": 0.81498969, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.84182149, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 2.860128164291382 + }, + { + "auxiliary_loss_clip": 0.01431321, + "auxiliary_loss_mlp": 0.01250971, + "balance_loss_clip": 1.11555576, + "balance_loss_mlp": 1.03486824, + "epoch": 0.47359086126559446, + "flos": 19393089230880.0, + "grad_norm": 1.8404680577573123, + "language_loss": 0.78895056, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81577349, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 2.760380268096924 + }, + { + "auxiliary_loss_clip": 0.01436587, + "auxiliary_loss_mlp": 0.01241726, + "balance_loss_clip": 1.12230754, + "balance_loss_mlp": 1.02695847, + "epoch": 0.47365098451826243, + "flos": 21941280234720.0, + "grad_norm": 1.923164490275193, + "language_loss": 0.70540422, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.73218733, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.7785396575927734 + }, + { + "auxiliary_loss_clip": 0.01439143, + "auxiliary_loss_mlp": 0.01254392, + "balance_loss_clip": 1.12335014, + "balance_loss_mlp": 1.04115045, + "epoch": 0.4737111077709304, + "flos": 25847236990080.0, + "grad_norm": 1.8578054719223058, + "language_loss": 0.75552374, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.78245908, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.805699348449707 + }, + { + "auxiliary_loss_clip": 0.01496038, + "auxiliary_loss_mlp": 0.01214119, + "balance_loss_clip": 1.20517802, + "balance_loss_mlp": 1.01346588, + "epoch": 0.47377123102359836, + "flos": 67767109290720.0, + "grad_norm": 0.7287106434272986, + "language_loss": 0.61278522, + "learning_rate": 2.266183812641164e-06, + "loss": 0.6398868, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.328974962234497 + }, + { + "auxiliary_loss_clip": 0.01434543, + "auxiliary_loss_mlp": 0.01242761, + "balance_loss_clip": 1.11916423, + "balance_loss_mlp": 1.02379727, + "epoch": 0.4738313542762663, + "flos": 24318261702720.0, + "grad_norm": 1.8639797365414985, + "language_loss": 0.6776455, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70441854, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 2.7943167686462402 + }, + { + "auxiliary_loss_clip": 0.01436631, + "auxiliary_loss_mlp": 0.01251206, + "balance_loss_clip": 1.12105608, + "balance_loss_mlp": 1.03491306, + "epoch": 0.4738914775289343, + "flos": 20707502731200.0, + "grad_norm": 1.7376959196343273, + "language_loss": 0.77303565, + "learning_rate": 2.265411798646092e-06, + "loss": 0.799914, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 2.775729179382324 + }, + { + "auxiliary_loss_clip": 0.01431432, + "auxiliary_loss_mlp": 0.01251449, + "balance_loss_clip": 1.11498559, + "balance_loss_mlp": 1.03477454, + "epoch": 0.4739516007816023, + "flos": 25448645949120.0, + "grad_norm": 1.9809716135685973, + "language_loss": 0.76105046, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78787923, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 2.8095180988311768 + }, + { + "auxiliary_loss_clip": 0.01435737, + "auxiliary_loss_mlp": 0.01242338, + "balance_loss_clip": 1.12016201, + "balance_loss_mlp": 1.02604485, + "epoch": 0.4740117240342703, + "flos": 19976392232640.0, + "grad_norm": 1.7677747523342895, + "language_loss": 0.72444445, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.75122523, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 2.7595229148864746 + }, + { + "auxiliary_loss_clip": 0.01433103, + "auxiliary_loss_mlp": 0.01250749, + "balance_loss_clip": 1.11777139, + "balance_loss_mlp": 1.0336926, + "epoch": 0.47407184728693824, + "flos": 15662514108960.0, + "grad_norm": 2.0773338694030783, + "language_loss": 0.82137156, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84820998, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 2.6662490367889404 + }, + { + "auxiliary_loss_clip": 0.01433467, + "auxiliary_loss_mlp": 0.01254666, + "balance_loss_clip": 1.11828935, + "balance_loss_mlp": 1.03684664, + "epoch": 0.4741319705396062, + "flos": 18590824775520.0, + "grad_norm": 2.083638260002572, + "language_loss": 0.73527348, + "learning_rate": 2.263867649999751e-06, + "loss": 0.76215488, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 2.764181137084961 + }, + { + "auxiliary_loss_clip": 0.01436451, + "auxiliary_loss_mlp": 0.01248174, + "balance_loss_clip": 1.12057757, + "balance_loss_mlp": 1.0257771, + "epoch": 0.47419209379227417, + "flos": 13262775380640.0, + "grad_norm": 2.7601130005609487, + "language_loss": 0.74007499, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76692128, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.8526852130889893 + }, + { + "auxiliary_loss_clip": 0.01432115, + "auxiliary_loss_mlp": 0.01238094, + "balance_loss_clip": 1.11900365, + "balance_loss_mlp": 1.02065587, + "epoch": 0.47425221704494214, + "flos": 20045915278560.0, + "grad_norm": 1.8920459181203833, + "language_loss": 0.77215123, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.7988534, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.7861714363098145 + }, + { + "auxiliary_loss_clip": 0.01432835, + "auxiliary_loss_mlp": 0.01242863, + "balance_loss_clip": 1.11695147, + "balance_loss_mlp": 1.02485275, + "epoch": 0.4743123402976101, + "flos": 27274374074880.0, + "grad_norm": 1.982254274001814, + "language_loss": 0.72597402, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.75273097, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.8961715698242188 + }, + { + "auxiliary_loss_clip": 0.01475064, + "auxiliary_loss_mlp": 0.0118615, + "balance_loss_clip": 1.18116677, + "balance_loss_mlp": 0.98473358, + "epoch": 0.47437246355027807, + "flos": 55399598942400.0, + "grad_norm": 0.73164835561231, + "language_loss": 0.55900431, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58561641, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.323418140411377 + }, + { + "auxiliary_loss_clip": 0.01431197, + "auxiliary_loss_mlp": 0.01244603, + "balance_loss_clip": 1.11688399, + "balance_loss_mlp": 1.02754712, + "epoch": 0.47443258680294603, + "flos": 23880756005280.0, + "grad_norm": 2.164967451652837, + "language_loss": 0.65810919, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.68486714, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.8537440299987793 + }, + { + "auxiliary_loss_clip": 0.0142836, + "auxiliary_loss_mlp": 0.01252985, + "balance_loss_clip": 1.11318231, + "balance_loss_mlp": 1.03249514, + "epoch": 0.474492710055614, + "flos": 21979739753280.0, + "grad_norm": 2.740288587477437, + "language_loss": 0.70378304, + "learning_rate": 2.26155112714642e-06, + "loss": 0.73059648, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.7869646549224854 + }, + { + "auxiliary_loss_clip": 0.01469158, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_clip": 1.17462111, + "balance_loss_mlp": 1.0080719, + "epoch": 0.47455283330828196, + "flos": 62563882563360.0, + "grad_norm": 0.8065826267508563, + "language_loss": 0.5859344, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.61269796, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.375082492828369 + }, + { + "auxiliary_loss_clip": 0.01429346, + "auxiliary_loss_mlp": 0.01242955, + "balance_loss_clip": 1.11352479, + "balance_loss_mlp": 1.02647066, + "epoch": 0.47461295656094993, + "flos": 12095259101280.0, + "grad_norm": 2.235925407206761, + "language_loss": 0.77417099, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.80089396, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 2.8579907417297363 + }, + { + "auxiliary_loss_clip": 0.01424778, + "auxiliary_loss_mlp": 0.01248648, + "balance_loss_clip": 1.11026049, + "balance_loss_mlp": 1.03578758, + "epoch": 0.4746730798136179, + "flos": 20886563396160.0, + "grad_norm": 1.9749499296581576, + "language_loss": 0.7470271, + "learning_rate": 2.260392731628497e-06, + "loss": 0.77376139, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 2.8859853744506836 + }, + { + "auxiliary_loss_clip": 0.01431863, + "auxiliary_loss_mlp": 0.01239097, + "balance_loss_clip": 1.1163547, + "balance_loss_mlp": 1.0228039, + "epoch": 0.4747332030662859, + "flos": 19976885298720.0, + "grad_norm": 1.932268994595859, + "language_loss": 0.82391965, + "learning_rate": 2.260006580021429e-06, + "loss": 0.85062921, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.7859809398651123 + }, + { + "auxiliary_loss_clip": 0.01433976, + "auxiliary_loss_mlp": 0.01251164, + "balance_loss_clip": 1.11866117, + "balance_loss_mlp": 1.03677821, + "epoch": 0.4747933263189539, + "flos": 16036034271840.0, + "grad_norm": 1.957745495233551, + "language_loss": 0.76084101, + "learning_rate": 2.259620418554886e-06, + "loss": 0.78769243, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 2.7384002208709717 + }, + { + "auxiliary_loss_clip": 0.01429897, + "auxiliary_loss_mlp": 0.01249544, + "balance_loss_clip": 1.11376226, + "balance_loss_mlp": 1.03229713, + "epoch": 0.47485344957162184, + "flos": 13956867630720.0, + "grad_norm": 1.9294718945495999, + "language_loss": 0.63701355, + "learning_rate": 2.25923424724351e-06, + "loss": 0.66380799, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 4.322643995285034 + }, + { + "auxiliary_loss_clip": 0.01432837, + "auxiliary_loss_mlp": 0.01249091, + "balance_loss_clip": 1.11719584, + "balance_loss_mlp": 1.0310806, + "epoch": 0.4749135728242898, + "flos": 20451181675680.0, + "grad_norm": 2.149552764104555, + "language_loss": 0.69404185, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72086114, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 2.815859079360962 + }, + { + "auxiliary_loss_clip": 0.014298, + "auxiliary_loss_mlp": 0.01244817, + "balance_loss_clip": 1.11368418, + "balance_loss_mlp": 1.02795112, + "epoch": 0.4749736960769578, + "flos": 28952522272800.0, + "grad_norm": 2.1363680130710043, + "language_loss": 0.68523479, + "learning_rate": 2.258461875144837e-06, + "loss": 0.71198094, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.8098137378692627 + }, + { + "auxiliary_loss_clip": 0.01430179, + "auxiliary_loss_mlp": 0.01240894, + "balance_loss_clip": 1.11532474, + "balance_loss_mlp": 1.02555466, + "epoch": 0.47503381932962574, + "flos": 31941329083200.0, + "grad_norm": 1.9799987796939118, + "language_loss": 0.70785868, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.73456943, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 2.8356313705444336 + }, + { + "auxiliary_loss_clip": 0.01432306, + "auxiliary_loss_mlp": 0.01253126, + "balance_loss_clip": 1.11700666, + "balance_loss_mlp": 1.03740454, + "epoch": 0.4750939425822937, + "flos": 22129443658080.0, + "grad_norm": 87.92049189345956, + "language_loss": 0.73824555, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.76509988, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.7655584812164307 + }, + { + "auxiliary_loss_clip": 0.01426498, + "auxiliary_loss_mlp": 0.0124746, + "balance_loss_clip": 1.11229527, + "balance_loss_mlp": 1.03574419, + "epoch": 0.47515406583496167, + "flos": 20852200118880.0, + "grad_norm": 2.5642481004236792, + "language_loss": 0.69281662, + "learning_rate": 2.257303243526688e-06, + "loss": 0.71955621, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 2.8297252655029297 + }, + { + "auxiliary_loss_clip": 0.01422863, + "auxiliary_loss_mlp": 0.0123733, + "balance_loss_clip": 1.10815799, + "balance_loss_mlp": 1.025805, + "epoch": 0.47521418908762963, + "flos": 17526436256160.0, + "grad_norm": 1.7896425974154981, + "language_loss": 0.72133309, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74793506, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 4.272402286529541 + }, + { + "auxiliary_loss_clip": 0.01432825, + "auxiliary_loss_mlp": 0.01254518, + "balance_loss_clip": 1.1177845, + "balance_loss_mlp": 1.03994143, + "epoch": 0.4752743123402976, + "flos": 20561591642400.0, + "grad_norm": 1.6826155344388363, + "language_loss": 0.86296427, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88983768, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 2.693085193634033 + }, + { + "auxiliary_loss_clip": 0.01423316, + "auxiliary_loss_mlp": 0.01242647, + "balance_loss_clip": 1.10861182, + "balance_loss_mlp": 1.03016829, + "epoch": 0.47533443559296557, + "flos": 26363102994720.0, + "grad_norm": 2.936513977611472, + "language_loss": 0.82266635, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84932601, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 4.2445478439331055 + }, + { + "auxiliary_loss_clip": 0.01461166, + "auxiliary_loss_mlp": 0.01253548, + "balance_loss_clip": 1.16582, + "balance_loss_mlp": 1.05365753, + "epoch": 0.47539455884563353, + "flos": 65956514500800.0, + "grad_norm": 0.7984470500319605, + "language_loss": 0.58943403, + "learning_rate": 2.255758264840002e-06, + "loss": 0.6165812, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 3.4150147438049316 + }, + { + "auxiliary_loss_clip": 0.01423472, + "auxiliary_loss_mlp": 0.01242324, + "balance_loss_clip": 1.10944092, + "balance_loss_mlp": 1.02812922, + "epoch": 0.4754546820983015, + "flos": 17240037805440.0, + "grad_norm": 1.8769016483049152, + "language_loss": 0.81564415, + "learning_rate": 2.255371995885765e-06, + "loss": 0.84230214, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.8639144897460938 + }, + { + "auxiliary_loss_clip": 0.01428199, + "auxiliary_loss_mlp": 0.01241159, + "balance_loss_clip": 1.11125708, + "balance_loss_mlp": 1.02772713, + "epoch": 0.47551480535096946, + "flos": 19827788244480.0, + "grad_norm": 1.8771810467929781, + "language_loss": 0.74227458, + "learning_rate": 2.254985717247797e-06, + "loss": 0.76896816, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 2.774064779281616 + }, + { + "auxiliary_loss_clip": 0.01425203, + "auxiliary_loss_mlp": 0.01234658, + "balance_loss_clip": 1.11020613, + "balance_loss_mlp": 1.01969957, + "epoch": 0.4755749286036375, + "flos": 22166158481280.0, + "grad_norm": 1.6367005059364974, + "language_loss": 0.75651699, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.78311563, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 2.7907259464263916 + }, + { + "auxiliary_loss_clip": 0.01425349, + "auxiliary_loss_mlp": 0.01235245, + "balance_loss_clip": 1.10983706, + "balance_loss_mlp": 1.02085912, + "epoch": 0.47563505185630545, + "flos": 21650178692160.0, + "grad_norm": 1.6692834245950516, + "language_loss": 0.78855777, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81516373, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 4.290902376174927 + }, + { + "auxiliary_loss_clip": 0.01418076, + "auxiliary_loss_mlp": 0.01251786, + "balance_loss_clip": 1.10414267, + "balance_loss_mlp": 1.03606474, + "epoch": 0.4756951751089734, + "flos": 20630735406720.0, + "grad_norm": 1.9546590406937088, + "language_loss": 0.75732827, + "learning_rate": 2.253826823377983e-06, + "loss": 0.78402692, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.735828161239624 + }, + { + "auxiliary_loss_clip": 0.01427379, + "auxiliary_loss_mlp": 0.01236534, + "balance_loss_clip": 1.11230838, + "balance_loss_mlp": 1.02004945, + "epoch": 0.4757552983616414, + "flos": 25851181518720.0, + "grad_norm": 1.5561318564284756, + "language_loss": 0.74067307, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76731217, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.8825464248657227 + }, + { + "auxiliary_loss_clip": 0.01429111, + "auxiliary_loss_mlp": 0.01237719, + "balance_loss_clip": 1.11285853, + "balance_loss_mlp": 1.02314222, + "epoch": 0.47581542161430934, + "flos": 18224966100960.0, + "grad_norm": 2.6090604697979964, + "language_loss": 0.7255342, + "learning_rate": 2.253054179314666e-06, + "loss": 0.75220251, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 2.7866148948669434 + }, + { + "auxiliary_loss_clip": 0.01430441, + "auxiliary_loss_mlp": 0.01247969, + "balance_loss_clip": 1.11622143, + "balance_loss_mlp": 1.03129435, + "epoch": 0.4758755448669773, + "flos": 21581907275520.0, + "grad_norm": 2.1399487863981848, + "language_loss": 0.64844775, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.67523193, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 2.764240264892578 + }, + { + "auxiliary_loss_clip": 0.01420556, + "auxiliary_loss_mlp": 0.01230977, + "balance_loss_clip": 1.1066469, + "balance_loss_mlp": 1.0169723, + "epoch": 0.47593566811964527, + "flos": 15232935396960.0, + "grad_norm": 1.7768992033146884, + "language_loss": 0.76743841, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79395378, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 2.852808952331543 + }, + { + "auxiliary_loss_clip": 0.01425168, + "auxiliary_loss_mlp": 0.01238322, + "balance_loss_clip": 1.11010575, + "balance_loss_mlp": 1.0237453, + "epoch": 0.47599579137231324, + "flos": 21545382093120.0, + "grad_norm": 1.9677875419941084, + "language_loss": 0.64215809, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66879296, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 2.7600622177124023 + }, + { + "auxiliary_loss_clip": 0.01456647, + "auxiliary_loss_mlp": 0.01195183, + "balance_loss_clip": 1.16555023, + "balance_loss_mlp": 0.99529266, + "epoch": 0.4760559146249812, + "flos": 64560896081280.0, + "grad_norm": 0.8414470763385806, + "language_loss": 0.65613902, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.6826573, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.3338539600372314 + }, + { + "auxiliary_loss_clip": 0.01425862, + "auxiliary_loss_mlp": 0.01236224, + "balance_loss_clip": 1.1126864, + "balance_loss_mlp": 1.02088428, + "epoch": 0.47611603787764917, + "flos": 22235909096160.0, + "grad_norm": 1.7440214754557422, + "language_loss": 0.688591, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.71521187, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 2.799788475036621 + }, + { + "auxiliary_loss_clip": 0.01419729, + "auxiliary_loss_mlp": 0.01253117, + "balance_loss_clip": 1.10635185, + "balance_loss_mlp": 1.03853989, + "epoch": 0.47617616113031713, + "flos": 22782004208640.0, + "grad_norm": 1.5434636616240334, + "language_loss": 0.74533248, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.77206087, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.8652052879333496 + }, + { + "auxiliary_loss_clip": 0.01431699, + "auxiliary_loss_mlp": 0.01244729, + "balance_loss_clip": 1.11608887, + "balance_loss_mlp": 1.02481151, + "epoch": 0.4762362843829851, + "flos": 24136090928640.0, + "grad_norm": 1.748258142624229, + "language_loss": 0.77752137, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.80428565, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 2.797205924987793 + }, + { + "auxiliary_loss_clip": 0.0142431, + "auxiliary_loss_mlp": 0.01238309, + "balance_loss_clip": 1.10951877, + "balance_loss_mlp": 1.01915479, + "epoch": 0.47629640763565306, + "flos": 22454074058400.0, + "grad_norm": 1.4886017834159122, + "language_loss": 0.7812885, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80791473, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 2.8646240234375 + }, + { + "auxiliary_loss_clip": 0.01427262, + "auxiliary_loss_mlp": 0.01257811, + "balance_loss_clip": 1.1116581, + "balance_loss_mlp": 1.04113626, + "epoch": 0.4763565308883211, + "flos": 11182963960800.0, + "grad_norm": 1.7970471939461254, + "language_loss": 0.72794354, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.75479424, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 2.7581326961517334 + }, + { + "auxiliary_loss_clip": 0.01426011, + "auxiliary_loss_mlp": 0.01245214, + "balance_loss_clip": 1.11140478, + "balance_loss_mlp": 1.03082848, + "epoch": 0.47641665414098905, + "flos": 22384323443520.0, + "grad_norm": 3.054094555913353, + "language_loss": 0.8242166, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.85092884, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.7685365676879883 + }, + { + "auxiliary_loss_clip": 0.01434839, + "auxiliary_loss_mlp": 0.01251744, + "balance_loss_clip": 1.11851728, + "balance_loss_mlp": 1.03125441, + "epoch": 0.476476777393657, + "flos": 25048499853600.0, + "grad_norm": 2.6877578277341825, + "language_loss": 0.80412745, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.83099329, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.8260436058044434 + }, + { + "auxiliary_loss_clip": 0.01418076, + "auxiliary_loss_mlp": 0.01248576, + "balance_loss_clip": 1.10289812, + "balance_loss_mlp": 1.03361797, + "epoch": 0.476536900646325, + "flos": 27272288026080.0, + "grad_norm": 1.7315462789954867, + "language_loss": 0.72013217, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74679869, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.8218917846679688 + }, + { + "auxiliary_loss_clip": 0.01426492, + "auxiliary_loss_mlp": 0.01249162, + "balance_loss_clip": 1.11160696, + "balance_loss_mlp": 1.02943599, + "epoch": 0.47659702389899294, + "flos": 25303758920640.0, + "grad_norm": 2.374215259258239, + "language_loss": 0.68694061, + "learning_rate": 2.248031062546432e-06, + "loss": 0.71369714, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.8340871334075928 + }, + { + "auxiliary_loss_clip": 0.01430422, + "auxiliary_loss_mlp": 0.01248598, + "balance_loss_clip": 1.11513782, + "balance_loss_mlp": 1.03421187, + "epoch": 0.4766571471516609, + "flos": 25995158271360.0, + "grad_norm": 1.74051447903114, + "language_loss": 0.6825453, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70933557, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.81640362739563 + }, + { + "auxiliary_loss_clip": 0.01427861, + "auxiliary_loss_mlp": 0.01249188, + "balance_loss_clip": 1.11204076, + "balance_loss_mlp": 1.03423011, + "epoch": 0.4767172704043289, + "flos": 16033948223040.0, + "grad_norm": 2.2008513212124945, + "language_loss": 0.78642464, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81319511, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.817810297012329 + }, + { + "auxiliary_loss_clip": 0.01421977, + "auxiliary_loss_mlp": 0.0124458, + "balance_loss_clip": 1.1055783, + "balance_loss_mlp": 1.03095651, + "epoch": 0.47677739365699684, + "flos": 39238286865120.0, + "grad_norm": 1.8419552633063838, + "language_loss": 0.66745239, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.69411802, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.987015724182129 + }, + { + "auxiliary_loss_clip": 0.01427716, + "auxiliary_loss_mlp": 0.01241156, + "balance_loss_clip": 1.11148548, + "balance_loss_mlp": 1.02638888, + "epoch": 0.4768375169096648, + "flos": 24720114565440.0, + "grad_norm": 1.989808729988306, + "language_loss": 0.80121571, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82790446, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.852747917175293 + }, + { + "auxiliary_loss_clip": 0.01428872, + "auxiliary_loss_mlp": 0.01243756, + "balance_loss_clip": 1.11180544, + "balance_loss_mlp": 1.02784383, + "epoch": 0.47689764016233277, + "flos": 22530879311040.0, + "grad_norm": 1.7103702476082518, + "language_loss": 0.75964171, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78636795, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 2.7858386039733887 + }, + { + "auxiliary_loss_clip": 0.01427784, + "auxiliary_loss_mlp": 0.01243275, + "balance_loss_clip": 1.11113811, + "balance_loss_mlp": 1.02888894, + "epoch": 0.47695776341500074, + "flos": 15122335789440.0, + "grad_norm": 2.198540733567346, + "language_loss": 0.7970978, + "learning_rate": 2.245712162906593e-06, + "loss": 0.82380837, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 2.805109739303589 + }, + { + "auxiliary_loss_clip": 0.01424894, + "auxiliary_loss_mlp": 0.0125476, + "balance_loss_clip": 1.10775709, + "balance_loss_mlp": 1.0390389, + "epoch": 0.4770178866676687, + "flos": 14680392497280.0, + "grad_norm": 2.3070423014537718, + "language_loss": 0.74316669, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76996326, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 2.7371504306793213 + }, + { + "auxiliary_loss_clip": 0.01429892, + "auxiliary_loss_mlp": 0.01249052, + "balance_loss_clip": 1.11269712, + "balance_loss_mlp": 1.03123283, + "epoch": 0.47707800992033667, + "flos": 22567821703200.0, + "grad_norm": 2.017113383285747, + "language_loss": 0.8011542, + "learning_rate": 2.244939121664211e-06, + "loss": 0.82794368, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 4.421812057495117 + }, + { + "auxiliary_loss_clip": 0.01430965, + "auxiliary_loss_mlp": 0.01254659, + "balance_loss_clip": 1.11343074, + "balance_loss_mlp": 1.03798413, + "epoch": 0.4771381331730047, + "flos": 30920785881120.0, + "grad_norm": 5.123421837669012, + "language_loss": 0.71149749, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73835373, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 2.856466293334961 + }, + { + "auxiliary_loss_clip": 0.01426982, + "auxiliary_loss_mlp": 0.01254362, + "balance_loss_clip": 1.10917056, + "balance_loss_mlp": 1.03978539, + "epoch": 0.47719825642567265, + "flos": 25741264618080.0, + "grad_norm": 2.1478171680665588, + "language_loss": 0.67892122, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.70573473, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 2.812023639678955 + }, + { + "auxiliary_loss_clip": 0.01462477, + "auxiliary_loss_mlp": 0.01204254, + "balance_loss_clip": 1.16862547, + "balance_loss_mlp": 1.00588989, + "epoch": 0.4772583796783406, + "flos": 66362236035840.0, + "grad_norm": 0.7030506524964013, + "language_loss": 0.56318188, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58984917, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 3.4423816204071045 + }, + { + "auxiliary_loss_clip": 0.01421195, + "auxiliary_loss_mlp": 0.01239457, + "balance_loss_clip": 1.10508776, + "balance_loss_mlp": 1.02564359, + "epoch": 0.4773185029310086, + "flos": 22053093543360.0, + "grad_norm": 1.679902535695165, + "language_loss": 0.89146692, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91807342, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.8343923091888428 + }, + { + "auxiliary_loss_clip": 0.01421151, + "auxiliary_loss_mlp": 0.01242525, + "balance_loss_clip": 1.10392416, + "balance_loss_mlp": 1.02928352, + "epoch": 0.47737862618367655, + "flos": 16729709312160.0, + "grad_norm": 2.0439223682363052, + "language_loss": 0.76990676, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79654348, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.7192482948303223 + }, + { + "auxiliary_loss_clip": 0.01424018, + "auxiliary_loss_mlp": 0.01248818, + "balance_loss_clip": 1.1080662, + "balance_loss_mlp": 1.03519487, + "epoch": 0.4774387494363445, + "flos": 19611481762080.0, + "grad_norm": 1.5960334658605289, + "language_loss": 0.84892035, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87564874, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 2.760594367980957 + }, + { + "auxiliary_loss_clip": 0.0143038, + "auxiliary_loss_mlp": 0.01252058, + "balance_loss_clip": 1.11295569, + "balance_loss_mlp": 1.03576517, + "epoch": 0.4774988726890125, + "flos": 16655824527840.0, + "grad_norm": 2.1622486037479245, + "language_loss": 0.75902855, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78585291, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 4.27311897277832 + }, + { + "auxiliary_loss_clip": 0.01426656, + "auxiliary_loss_mlp": 0.01257997, + "balance_loss_clip": 1.10888267, + "balance_loss_mlp": 1.04380131, + "epoch": 0.47755899594168044, + "flos": 20487668929920.0, + "grad_norm": 2.3911086329699627, + "language_loss": 0.64790797, + "learning_rate": 2.241846586342682e-06, + "loss": 0.6747545, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 4.32343864440918 + }, + { + "auxiliary_loss_clip": 0.01425424, + "auxiliary_loss_mlp": 0.01252606, + "balance_loss_clip": 1.10833192, + "balance_loss_mlp": 1.0391736, + "epoch": 0.4776191191943484, + "flos": 21654957640320.0, + "grad_norm": 1.8204238461652928, + "language_loss": 0.73682404, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.76360428, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 2.7761614322662354 + }, + { + "auxiliary_loss_clip": 0.01425588, + "auxiliary_loss_mlp": 0.01253761, + "balance_loss_clip": 1.10863662, + "balance_loss_mlp": 1.0388031, + "epoch": 0.4776792424470164, + "flos": 18772199058240.0, + "grad_norm": 2.0375313009531006, + "language_loss": 0.68293929, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70973283, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 2.7885684967041016 + }, + { + "auxiliary_loss_clip": 0.01419373, + "auxiliary_loss_mlp": 0.01237187, + "balance_loss_clip": 1.10265207, + "balance_loss_mlp": 1.0229919, + "epoch": 0.47773936569968434, + "flos": 29718413258400.0, + "grad_norm": 1.7962617226990993, + "language_loss": 0.75794935, + "learning_rate": 2.240686733875009e-06, + "loss": 0.78451502, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 2.9144675731658936 + }, + { + "auxiliary_loss_clip": 0.01426385, + "auxiliary_loss_mlp": 0.01250313, + "balance_loss_clip": 1.10859108, + "balance_loss_mlp": 1.03382862, + "epoch": 0.4777994889523523, + "flos": 24793923493440.0, + "grad_norm": 2.0207944422864923, + "language_loss": 0.79103833, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81780529, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 2.8080294132232666 + }, + { + "auxiliary_loss_clip": 0.01419507, + "auxiliary_loss_mlp": 0.01260941, + "balance_loss_clip": 1.1016283, + "balance_loss_mlp": 1.04884386, + "epoch": 0.47785961220502027, + "flos": 17860093558560.0, + "grad_norm": 2.2552899692003314, + "language_loss": 0.74123609, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.76804054, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 2.7528998851776123 + }, + { + "auxiliary_loss_clip": 0.01431896, + "auxiliary_loss_mlp": 0.01247084, + "balance_loss_clip": 1.11488831, + "balance_loss_mlp": 1.03212583, + "epoch": 0.4779197354576883, + "flos": 20268631620000.0, + "grad_norm": 1.484117568867538, + "language_loss": 0.77772129, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80451107, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 2.7678964138031006 + }, + { + "auxiliary_loss_clip": 0.01416518, + "auxiliary_loss_mlp": 0.01240522, + "balance_loss_clip": 1.09952319, + "balance_loss_mlp": 1.02689862, + "epoch": 0.47797985871035625, + "flos": 17058549738240.0, + "grad_norm": 2.1826556429552264, + "language_loss": 0.74022925, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.76679963, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 4.189391374588013 + }, + { + "auxiliary_loss_clip": 0.01424129, + "auxiliary_loss_mlp": 0.01245153, + "balance_loss_clip": 1.10535049, + "balance_loss_mlp": 1.03248405, + "epoch": 0.4780399819630242, + "flos": 31361136190560.0, + "grad_norm": 2.6212354348133, + "language_loss": 0.74039972, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76709259, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.8087058067321777 + }, + { + "auxiliary_loss_clip": 0.01419601, + "auxiliary_loss_mlp": 0.01252968, + "balance_loss_clip": 1.10156429, + "balance_loss_mlp": 1.03972673, + "epoch": 0.4781001052156922, + "flos": 24901906057920.0, + "grad_norm": 1.9456648806117482, + "language_loss": 0.79833603, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82506168, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.826709270477295 + }, + { + "auxiliary_loss_clip": 0.01425363, + "auxiliary_loss_mlp": 0.01249901, + "balance_loss_clip": 1.10612035, + "balance_loss_mlp": 1.03475237, + "epoch": 0.47816022846836015, + "flos": 18699717615840.0, + "grad_norm": 1.7806091753755204, + "language_loss": 0.78222883, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80898142, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.749222755432129 + }, + { + "auxiliary_loss_clip": 0.01429385, + "auxiliary_loss_mlp": 0.01253732, + "balance_loss_clip": 1.11008728, + "balance_loss_mlp": 1.03877354, + "epoch": 0.4782203517210281, + "flos": 11978818557120.0, + "grad_norm": 2.545382239548308, + "language_loss": 0.83898669, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.8658179, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 2.716552495956421 + }, + { + "auxiliary_loss_clip": 0.01423466, + "auxiliary_loss_mlp": 0.01247523, + "balance_loss_clip": 1.10576916, + "balance_loss_mlp": 1.03561676, + "epoch": 0.4782804749736961, + "flos": 20815599080160.0, + "grad_norm": 1.4901584064096782, + "language_loss": 0.70604569, + "learning_rate": 2.237206685204768e-06, + "loss": 0.7327556, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 2.8090906143188477 + }, + { + "auxiliary_loss_clip": 0.01430904, + "auxiliary_loss_mlp": 0.01260361, + "balance_loss_clip": 1.11199522, + "balance_loss_mlp": 1.04521179, + "epoch": 0.47834059822636404, + "flos": 23842486127520.0, + "grad_norm": 2.176291416412908, + "language_loss": 0.81875563, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.84566832, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.818819046020508 + }, + { + "auxiliary_loss_clip": 0.01429044, + "auxiliary_loss_mlp": 0.01245443, + "balance_loss_clip": 1.11118472, + "balance_loss_mlp": 1.03391767, + "epoch": 0.478400721479032, + "flos": 22635751766400.0, + "grad_norm": 1.994963169759939, + "language_loss": 0.84884441, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87558925, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 2.8123104572296143 + }, + { + "auxiliary_loss_clip": 0.01425636, + "auxiliary_loss_mlp": 0.01243117, + "balance_loss_clip": 1.10551405, + "balance_loss_mlp": 1.02911258, + "epoch": 0.4784608447317, + "flos": 19357019186400.0, + "grad_norm": 2.244662430935276, + "language_loss": 0.80004269, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.82673025, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 2.8218772411346436 + }, + { + "auxiliary_loss_clip": 0.01429856, + "auxiliary_loss_mlp": 0.01247999, + "balance_loss_clip": 1.11015201, + "balance_loss_mlp": 1.0336132, + "epoch": 0.47852096798436794, + "flos": 24023215631520.0, + "grad_norm": 2.273778067118334, + "language_loss": 0.83044845, + "learning_rate": 2.235659762404047e-06, + "loss": 0.85722697, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.827169179916382 + }, + { + "auxiliary_loss_clip": 0.01424641, + "auxiliary_loss_mlp": 0.01240116, + "balance_loss_clip": 1.10737455, + "balance_loss_mlp": 1.02649307, + "epoch": 0.4785810912370359, + "flos": 25668821103840.0, + "grad_norm": 2.2386506613306714, + "language_loss": 0.7291345, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75578207, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 2.826195240020752 + }, + { + "auxiliary_loss_clip": 0.01428412, + "auxiliary_loss_mlp": 0.01244664, + "balance_loss_clip": 1.1090672, + "balance_loss_mlp": 1.03256679, + "epoch": 0.47864121448970387, + "flos": 21434175635040.0, + "grad_norm": 1.6810808008941218, + "language_loss": 0.77580053, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.80253124, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.75931715965271 + }, + { + "auxiliary_loss_clip": 0.01426719, + "auxiliary_loss_mlp": 0.012408, + "balance_loss_clip": 1.1083405, + "balance_loss_mlp": 1.02526975, + "epoch": 0.47870133774237184, + "flos": 16145761531680.0, + "grad_norm": 1.740132518973838, + "language_loss": 0.77637643, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.80305159, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 2.7827823162078857 + }, + { + "auxiliary_loss_clip": 0.01431369, + "auxiliary_loss_mlp": 0.01244532, + "balance_loss_clip": 1.11510968, + "balance_loss_mlp": 1.02881134, + "epoch": 0.47876146099503986, + "flos": 26909236035360.0, + "grad_norm": 1.7348405305934242, + "language_loss": 0.64921337, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.6759724, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 2.8006114959716797 + }, + { + "auxiliary_loss_clip": 0.01430239, + "auxiliary_loss_mlp": 0.01241974, + "balance_loss_clip": 1.11357856, + "balance_loss_mlp": 1.02682483, + "epoch": 0.4788215842477078, + "flos": 45335564923680.0, + "grad_norm": 1.8453112264265326, + "language_loss": 0.77488673, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.80160886, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.9624862670898438 + }, + { + "auxiliary_loss_clip": 0.01428008, + "auxiliary_loss_mlp": 0.01251799, + "balance_loss_clip": 1.11087179, + "balance_loss_mlp": 1.03207207, + "epoch": 0.4788817075003758, + "flos": 22239398486880.0, + "grad_norm": 1.7219841563519962, + "language_loss": 0.76403904, + "learning_rate": 2.233339110409044e-06, + "loss": 0.79083717, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.7599332332611084 + }, + { + "auxiliary_loss_clip": 0.01423249, + "auxiliary_loss_mlp": 0.01234079, + "balance_loss_clip": 1.10457325, + "balance_loss_mlp": 1.02083743, + "epoch": 0.47894183075304375, + "flos": 16473008975040.0, + "grad_norm": 2.326807002848438, + "language_loss": 0.74681675, + "learning_rate": 2.232952304022137e-06, + "loss": 0.77339005, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.7295258045196533 + }, + { + "auxiliary_loss_clip": 0.01433807, + "auxiliary_loss_mlp": 0.01260824, + "balance_loss_clip": 1.11471248, + "balance_loss_mlp": 1.0485363, + "epoch": 0.4790019540057117, + "flos": 24285150054720.0, + "grad_norm": 1.6290196614217674, + "language_loss": 0.72990501, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75685132, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.8069562911987305 + }, + { + "auxiliary_loss_clip": 0.01430291, + "auxiliary_loss_mlp": 0.01237523, + "balance_loss_clip": 1.1120019, + "balance_loss_mlp": 1.02428126, + "epoch": 0.4790620772583797, + "flos": 25668859032000.0, + "grad_norm": 2.181538500346203, + "language_loss": 0.79594857, + "learning_rate": 2.232178664762267e-06, + "loss": 0.82262671, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 2.8086581230163574 + }, + { + "auxiliary_loss_clip": 0.0147006, + "auxiliary_loss_mlp": 0.01215919, + "balance_loss_clip": 1.17421246, + "balance_loss_mlp": 1.01755524, + "epoch": 0.47912220051104765, + "flos": 69435813012480.0, + "grad_norm": 0.761562228406424, + "language_loss": 0.6216234, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64848322, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 3.449336528778076 + }, + { + "auxiliary_loss_clip": 0.01432017, + "auxiliary_loss_mlp": 0.01246944, + "balance_loss_clip": 1.1144886, + "balance_loss_mlp": 1.03007817, + "epoch": 0.4791823237637156, + "flos": 24171136912800.0, + "grad_norm": 1.5971327635348573, + "language_loss": 0.77870655, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.80549622, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.811232328414917 + }, + { + "auxiliary_loss_clip": 0.01423384, + "auxiliary_loss_mlp": 0.01233862, + "balance_loss_clip": 1.104509, + "balance_loss_mlp": 1.02195597, + "epoch": 0.4792424470163836, + "flos": 24753643423200.0, + "grad_norm": 1.8419957938087719, + "language_loss": 0.70223534, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72880781, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.972675085067749 + }, + { + "auxiliary_loss_clip": 0.01422918, + "auxiliary_loss_mlp": 0.0124177, + "balance_loss_clip": 1.1044805, + "balance_loss_mlp": 1.02814651, + "epoch": 0.47930257026905154, + "flos": 23260131329760.0, + "grad_norm": 1.3415825778521115, + "language_loss": 0.7995261, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82617295, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.906919240951538 + }, + { + "auxiliary_loss_clip": 0.0142652, + "auxiliary_loss_mlp": 0.01249189, + "balance_loss_clip": 1.1069181, + "balance_loss_mlp": 1.03403974, + "epoch": 0.4793626935217195, + "flos": 14065836327360.0, + "grad_norm": 2.8948232299048446, + "language_loss": 0.70001477, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.72677183, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 5.560562610626221 + }, + { + "auxiliary_loss_clip": 0.01437681, + "auxiliary_loss_mlp": 0.01234316, + "balance_loss_clip": 1.1186167, + "balance_loss_mlp": 1.01992953, + "epoch": 0.4794228167743875, + "flos": 21801020441760.0, + "grad_norm": 1.8690475748673114, + "language_loss": 0.78592896, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.81264889, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 2.888857126235962 + }, + { + "auxiliary_loss_clip": 0.01462131, + "auxiliary_loss_mlp": 0.01205711, + "balance_loss_clip": 1.16588473, + "balance_loss_mlp": 1.00505829, + "epoch": 0.47948294002705544, + "flos": 66975578504640.0, + "grad_norm": 0.7543894461951912, + "language_loss": 0.5393554, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56603384, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 3.475104808807373 + }, + { + "auxiliary_loss_clip": 0.01435614, + "auxiliary_loss_mlp": 0.01245393, + "balance_loss_clip": 1.1177808, + "balance_loss_mlp": 1.02471232, + "epoch": 0.47954306327972346, + "flos": 12424137455520.0, + "grad_norm": 1.9942542106204173, + "language_loss": 0.90302134, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92983139, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 2.865514039993286 + }, + { + "auxiliary_loss_clip": 0.01435737, + "auxiliary_loss_mlp": 0.01253825, + "balance_loss_clip": 1.11591136, + "balance_loss_mlp": 1.03505182, + "epoch": 0.4796031865323914, + "flos": 18363519126720.0, + "grad_norm": 2.241117434405845, + "language_loss": 0.73745346, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.7643491, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 2.9630751609802246 + }, + { + "auxiliary_loss_clip": 0.01431691, + "auxiliary_loss_mlp": 0.0123776, + "balance_loss_clip": 1.11321771, + "balance_loss_mlp": 1.02318311, + "epoch": 0.4796633097850594, + "flos": 21837242198880.0, + "grad_norm": 1.5906913965583769, + "language_loss": 0.78814244, + "learning_rate": 2.228309942555734e-06, + "loss": 0.81483692, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.9179251194000244 + }, + { + "auxiliary_loss_clip": 0.01433895, + "auxiliary_loss_mlp": 0.01245557, + "balance_loss_clip": 1.11502755, + "balance_loss_mlp": 1.03098035, + "epoch": 0.47972343303772735, + "flos": 23439419563680.0, + "grad_norm": 1.7097322722000663, + "language_loss": 0.89619368, + "learning_rate": 2.22792302247656e-06, + "loss": 0.92298818, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 2.8680107593536377 + }, + { + "auxiliary_loss_clip": 0.01426797, + "auxiliary_loss_mlp": 0.01239045, + "balance_loss_clip": 1.10822213, + "balance_loss_mlp": 1.02465892, + "epoch": 0.4797835562903953, + "flos": 24902095698720.0, + "grad_norm": 1.7140124065294058, + "language_loss": 0.76643145, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79308981, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 3.0600876808166504 + }, + { + "auxiliary_loss_clip": 0.01430369, + "auxiliary_loss_mlp": 0.01248767, + "balance_loss_clip": 1.11231256, + "balance_loss_mlp": 1.03190207, + "epoch": 0.4798436795430633, + "flos": 35045893730880.0, + "grad_norm": 1.7755935676553751, + "language_loss": 0.71692616, + "learning_rate": 2.227149156404295e-06, + "loss": 0.74371755, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 4.391411066055298 + }, + { + "auxiliary_loss_clip": 0.01428773, + "auxiliary_loss_mlp": 0.01240185, + "balance_loss_clip": 1.11113238, + "balance_loss_mlp": 1.02675223, + "epoch": 0.47990380279573125, + "flos": 20592048319200.0, + "grad_norm": 1.8554396919697138, + "language_loss": 0.69900298, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72569251, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 4.345616817474365 + }, + { + "auxiliary_loss_clip": 0.01424633, + "auxiliary_loss_mlp": 0.01232447, + "balance_loss_clip": 1.10709691, + "balance_loss_mlp": 1.02111292, + "epoch": 0.4799639260483992, + "flos": 26361813437280.0, + "grad_norm": 1.7667529828154322, + "language_loss": 0.71193671, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73850751, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 2.97721266746521 + }, + { + "auxiliary_loss_clip": 0.01443379, + "auxiliary_loss_mlp": 0.0122773, + "balance_loss_clip": 1.14778578, + "balance_loss_mlp": 1.0286026, + "epoch": 0.4800240493010672, + "flos": 70985838792960.0, + "grad_norm": 0.8118706325839085, + "language_loss": 0.5929085, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61961961, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.3610880374908447 + }, + { + "auxiliary_loss_clip": 0.01426238, + "auxiliary_loss_mlp": 0.0123889, + "balance_loss_clip": 1.10737443, + "balance_loss_mlp": 1.0246948, + "epoch": 0.48008417255373514, + "flos": 17088247851840.0, + "grad_norm": 9.340065435205954, + "language_loss": 0.66874832, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.69539958, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 2.8739500045776367 + }, + { + "auxiliary_loss_clip": 0.01428261, + "auxiliary_loss_mlp": 0.0124267, + "balance_loss_clip": 1.11080384, + "balance_loss_mlp": 1.02828407, + "epoch": 0.4801442958064031, + "flos": 15415219955520.0, + "grad_norm": 1.7616968926622443, + "language_loss": 0.70049214, + "learning_rate": 2.225214340743835e-06, + "loss": 0.7272014, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 2.9631409645080566 + }, + { + "auxiliary_loss_clip": 0.01424286, + "auxiliary_loss_mlp": 0.01237495, + "balance_loss_clip": 1.10487819, + "balance_loss_mlp": 1.02062988, + "epoch": 0.4802044190590711, + "flos": 11475355060800.0, + "grad_norm": 1.9263105448155335, + "language_loss": 0.79172277, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.8183406, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 4.322758436203003 + }, + { + "auxiliary_loss_clip": 0.01425283, + "auxiliary_loss_mlp": 0.01247599, + "balance_loss_clip": 1.10731983, + "balance_loss_mlp": 1.03492963, + "epoch": 0.48026454231173904, + "flos": 20952634979520.0, + "grad_norm": 2.2652390551510146, + "language_loss": 0.74931014, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.776039, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 2.919869899749756 + }, + { + "auxiliary_loss_clip": 0.01431972, + "auxiliary_loss_mlp": 0.01250641, + "balance_loss_clip": 1.11355209, + "balance_loss_mlp": 1.03511083, + "epoch": 0.48032466556440706, + "flos": 20450157615360.0, + "grad_norm": 2.1858426410794283, + "language_loss": 0.78870964, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81553572, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.884883403778076 + }, + { + "auxiliary_loss_clip": 0.01419422, + "auxiliary_loss_mlp": 0.01252824, + "balance_loss_clip": 1.10078621, + "balance_loss_mlp": 1.03824687, + "epoch": 0.480384788817075, + "flos": 37123732886400.0, + "grad_norm": 1.7574112066182945, + "language_loss": 0.73355722, + "learning_rate": 2.223666334404724e-06, + "loss": 0.76027972, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.9872629642486572 + }, + { + "auxiliary_loss_clip": 0.01433377, + "auxiliary_loss_mlp": 0.01215157, + "balance_loss_clip": 1.13768542, + "balance_loss_mlp": 1.01526642, + "epoch": 0.480444912069743, + "flos": 69559118917920.0, + "grad_norm": 0.7677474221416786, + "language_loss": 0.59052283, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61700821, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.5045647621154785 + }, + { + "auxiliary_loss_clip": 0.01426571, + "auxiliary_loss_mlp": 0.01246723, + "balance_loss_clip": 1.10821629, + "balance_loss_mlp": 1.03176498, + "epoch": 0.48050503532241096, + "flos": 29824651127520.0, + "grad_norm": 1.9146861054375086, + "language_loss": 0.67307293, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69980586, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 2.935256004333496 + }, + { + "auxiliary_loss_clip": 0.01422904, + "auxiliary_loss_mlp": 0.01240294, + "balance_loss_clip": 1.10427845, + "balance_loss_mlp": 1.0264802, + "epoch": 0.4805651585750789, + "flos": 23950658332800.0, + "grad_norm": 1.5586840620888898, + "language_loss": 0.76219797, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78882992, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.9566493034362793 + }, + { + "auxiliary_loss_clip": 0.01429948, + "auxiliary_loss_mlp": 0.01245587, + "balance_loss_clip": 1.11156082, + "balance_loss_mlp": 1.03043783, + "epoch": 0.4806252818277469, + "flos": 25667759115360.0, + "grad_norm": 1.667741907326475, + "language_loss": 0.78235179, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80910712, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 2.898442268371582 + }, + { + "auxiliary_loss_clip": 0.01426027, + "auxiliary_loss_mlp": 0.01251471, + "balance_loss_clip": 1.10759616, + "balance_loss_mlp": 1.03670311, + "epoch": 0.48068540508041485, + "flos": 13153692899520.0, + "grad_norm": 2.249592379053931, + "language_loss": 0.79385608, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.82063103, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.950080633163452 + }, + { + "auxiliary_loss_clip": 0.01422364, + "auxiliary_loss_mlp": 0.01245274, + "balance_loss_clip": 1.10458159, + "balance_loss_mlp": 1.02993488, + "epoch": 0.4807455283330828, + "flos": 21178764855360.0, + "grad_norm": 1.4335208120966576, + "language_loss": 0.82701039, + "learning_rate": 2.2213440707461e-06, + "loss": 0.85368681, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 2.9585471153259277 + }, + { + "auxiliary_loss_clip": 0.01427371, + "auxiliary_loss_mlp": 0.01237371, + "balance_loss_clip": 1.1095109, + "balance_loss_mlp": 1.02241254, + "epoch": 0.4808056515857508, + "flos": 12277619516160.0, + "grad_norm": 2.1618350527405537, + "language_loss": 0.80765462, + "learning_rate": 2.220956997340516e-06, + "loss": 0.83430207, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 2.995959997177124 + }, + { + "auxiliary_loss_clip": 0.01426463, + "auxiliary_loss_mlp": 0.01243761, + "balance_loss_clip": 1.109658, + "balance_loss_mlp": 1.02918482, + "epoch": 0.48086577483841875, + "flos": 24828324698880.0, + "grad_norm": 2.8570331558669153, + "language_loss": 0.72637659, + "learning_rate": 2.220569915556221e-06, + "loss": 0.75307882, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.9075489044189453 + }, + { + "auxiliary_loss_clip": 0.01426478, + "auxiliary_loss_mlp": 0.01240498, + "balance_loss_clip": 1.10736835, + "balance_loss_mlp": 1.02668452, + "epoch": 0.4809258980910867, + "flos": 24467851823040.0, + "grad_norm": 1.8024933258268816, + "language_loss": 0.70973116, + "learning_rate": 2.220182825407892e-06, + "loss": 0.7364009, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 2.933763027191162 + }, + { + "auxiliary_loss_clip": 0.01420083, + "auxiliary_loss_mlp": 0.01246068, + "balance_loss_clip": 1.10245061, + "balance_loss_mlp": 1.03225398, + "epoch": 0.4809860213437547, + "flos": 21218134649760.0, + "grad_norm": 7.0595913640590275, + "language_loss": 0.714122, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.74078345, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.8976097106933594 + }, + { + "auxiliary_loss_clip": 0.01429881, + "auxiliary_loss_mlp": 0.0124561, + "balance_loss_clip": 1.1129725, + "balance_loss_mlp": 1.0285542, + "epoch": 0.48104614459642264, + "flos": 37635047511840.0, + "grad_norm": 1.3336376629756475, + "language_loss": 0.74837977, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77513474, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 3.0526578426361084 + }, + { + "auxiliary_loss_clip": 0.01423238, + "auxiliary_loss_mlp": 0.01250299, + "balance_loss_clip": 1.10603142, + "balance_loss_mlp": 1.03553164, + "epoch": 0.48110626784909066, + "flos": 18408123007200.0, + "grad_norm": 1.9166912118453747, + "language_loss": 0.81609833, + "learning_rate": 2.219021504925493e-06, + "loss": 0.84283376, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.922572612762451 + }, + { + "auxiliary_loss_clip": 0.01429174, + "auxiliary_loss_mlp": 0.01242379, + "balance_loss_clip": 1.11127019, + "balance_loss_mlp": 1.02589488, + "epoch": 0.48116639110175863, + "flos": 28442383420320.0, + "grad_norm": 1.7046986442126209, + "language_loss": 0.71362102, + "learning_rate": 2.218634381467819e-06, + "loss": 0.74033648, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 2.930922508239746 + }, + { + "auxiliary_loss_clip": 0.01430363, + "auxiliary_loss_mlp": 0.01239958, + "balance_loss_clip": 1.112257, + "balance_loss_mlp": 1.02747965, + "epoch": 0.4812265143544266, + "flos": 21727363226400.0, + "grad_norm": 1.8208562664449792, + "language_loss": 0.834014, + "learning_rate": 2.218247249719507e-06, + "loss": 0.86071724, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.916066884994507 + }, + { + "auxiliary_loss_clip": 0.01435686, + "auxiliary_loss_mlp": 0.01248239, + "balance_loss_clip": 1.11607218, + "balance_loss_mlp": 1.02622342, + "epoch": 0.48128663760709456, + "flos": 13226363982720.0, + "grad_norm": 2.178935889490051, + "language_loss": 0.77953112, + "learning_rate": 2.217860109695239e-06, + "loss": 0.80637038, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.856236219406128 + }, + { + "auxiliary_loss_clip": 0.01425003, + "auxiliary_loss_mlp": 0.01234787, + "balance_loss_clip": 1.10815048, + "balance_loss_mlp": 1.01925707, + "epoch": 0.4813467608597625, + "flos": 24245742332160.0, + "grad_norm": 4.521617606702993, + "language_loss": 0.70515847, + "learning_rate": 2.217472961409692e-06, + "loss": 0.73175639, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.9300332069396973 + }, + { + "auxiliary_loss_clip": 0.01429724, + "auxiliary_loss_mlp": 0.01236617, + "balance_loss_clip": 1.11217713, + "balance_loss_mlp": 1.0229938, + "epoch": 0.4814068841124305, + "flos": 27482336362080.0, + "grad_norm": 3.12949045595963, + "language_loss": 0.70407295, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.73073643, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.9516537189483643 + }, + { + "auxiliary_loss_clip": 0.01432962, + "auxiliary_loss_mlp": 0.01240974, + "balance_loss_clip": 1.11648595, + "balance_loss_mlp": 1.02639735, + "epoch": 0.48146700736509845, + "flos": 19574918651520.0, + "grad_norm": 1.8954238433069384, + "language_loss": 0.71668094, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.7434203, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.903247833251953 + }, + { + "auxiliary_loss_clip": 0.01436682, + "auxiliary_loss_mlp": 0.01241095, + "balance_loss_clip": 1.11827445, + "balance_loss_mlp": 1.02289414, + "epoch": 0.4815271306177664, + "flos": 20629673418240.0, + "grad_norm": 1.934644899129187, + "language_loss": 0.60818404, + "learning_rate": 2.216311467132199e-06, + "loss": 0.63496178, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.931520700454712 + }, + { + "auxiliary_loss_clip": 0.01440516, + "auxiliary_loss_mlp": 0.01201241, + "balance_loss_clip": 1.14686549, + "balance_loss_mlp": 0.99982452, + "epoch": 0.4815872538704344, + "flos": 67697472460320.0, + "grad_norm": 0.8614774384276758, + "language_loss": 0.61230624, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63872379, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.4794743061065674 + }, + { + "auxiliary_loss_clip": 0.01435648, + "auxiliary_loss_mlp": 0.01243698, + "balance_loss_clip": 1.12011409, + "balance_loss_mlp": 1.02912176, + "epoch": 0.48164737712310235, + "flos": 22823004913920.0, + "grad_norm": 2.0324845572128054, + "language_loss": 0.73240131, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75919473, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.932661294937134 + }, + { + "auxiliary_loss_clip": 0.01429757, + "auxiliary_loss_mlp": 0.01234779, + "balance_loss_clip": 1.11382878, + "balance_loss_mlp": 1.02249074, + "epoch": 0.4817075003757703, + "flos": 23736210330240.0, + "grad_norm": 2.622950400283698, + "language_loss": 0.79562742, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.82227272, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 4.606944799423218 + }, + { + "auxiliary_loss_clip": 0.01434142, + "auxiliary_loss_mlp": 0.01243375, + "balance_loss_clip": 1.11761832, + "balance_loss_mlp": 1.02822566, + "epoch": 0.4817676236284383, + "flos": 28185379657920.0, + "grad_norm": 2.1106110501723223, + "language_loss": 0.73724794, + "learning_rate": 2.214762693328326e-06, + "loss": 0.76402307, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 3.0594024658203125 + }, + { + "auxiliary_loss_clip": 0.01434205, + "auxiliary_loss_mlp": 0.01242202, + "balance_loss_clip": 1.11882448, + "balance_loss_mlp": 1.02762532, + "epoch": 0.48182774688110624, + "flos": 17093633650560.0, + "grad_norm": 2.277970352403747, + "language_loss": 0.90729564, + "learning_rate": 2.214375479481094e-06, + "loss": 0.93405968, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 2.8852338790893555 + }, + { + "auxiliary_loss_clip": 0.01435489, + "auxiliary_loss_mlp": 0.01248004, + "balance_loss_clip": 1.11954117, + "balance_loss_mlp": 1.03228343, + "epoch": 0.4818878701337742, + "flos": 12569517550080.0, + "grad_norm": 2.26827148151365, + "language_loss": 0.74393672, + "learning_rate": 2.213988257504722e-06, + "loss": 0.77077168, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 2.925473690032959 + }, + { + "auxiliary_loss_clip": 0.0143074, + "auxiliary_loss_mlp": 0.01243573, + "balance_loss_clip": 1.1156224, + "balance_loss_mlp": 1.02651632, + "epoch": 0.48194799338644223, + "flos": 24610842443520.0, + "grad_norm": 1.9591940087155069, + "language_loss": 0.80406523, + "learning_rate": 2.213601027413894e-06, + "loss": 0.8308084, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 2.9665122032165527 + }, + { + "auxiliary_loss_clip": 0.01431094, + "auxiliary_loss_mlp": 0.01242471, + "balance_loss_clip": 1.11567283, + "balance_loss_mlp": 1.03075492, + "epoch": 0.4820081166391102, + "flos": 21107307473280.0, + "grad_norm": 2.078674042755252, + "language_loss": 0.77751613, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.80425179, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 2.9675703048706055 + }, + { + "auxiliary_loss_clip": 0.01436111, + "auxiliary_loss_mlp": 0.01251615, + "balance_loss_clip": 1.12309003, + "balance_loss_mlp": 1.04047191, + "epoch": 0.48206823989177816, + "flos": 25266930312960.0, + "grad_norm": 1.9897887907190834, + "language_loss": 0.79994738, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82682467, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 3.0628662109375 + }, + { + "auxiliary_loss_clip": 0.01440587, + "auxiliary_loss_mlp": 0.01246822, + "balance_loss_clip": 1.12576008, + "balance_loss_mlp": 1.02938426, + "epoch": 0.4821283631444461, + "flos": 24647405554080.0, + "grad_norm": 2.4863515812429298, + "language_loss": 0.76221925, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78909338, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 2.922001838684082 + }, + { + "auxiliary_loss_clip": 0.01426847, + "auxiliary_loss_mlp": 0.01238476, + "balance_loss_clip": 1.11186802, + "balance_loss_mlp": 1.02313662, + "epoch": 0.4821884863971141, + "flos": 23954413220640.0, + "grad_norm": 1.633535443051149, + "language_loss": 0.79138857, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81804174, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 5.854142904281616 + }, + { + "auxiliary_loss_clip": 0.01439793, + "auxiliary_loss_mlp": 0.01237294, + "balance_loss_clip": 1.12485909, + "balance_loss_mlp": 1.02081037, + "epoch": 0.48224860964978206, + "flos": 17162322276960.0, + "grad_norm": 2.3795212834339847, + "language_loss": 0.69430184, + "learning_rate": 2.211664755756855e-06, + "loss": 0.72107279, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 2.8962044715881348 + }, + { + "auxiliary_loss_clip": 0.01435325, + "auxiliary_loss_mlp": 0.01249415, + "balance_loss_clip": 1.12078404, + "balance_loss_mlp": 1.03216815, + "epoch": 0.48230873290245, + "flos": 23078074340160.0, + "grad_norm": 2.0352631214813752, + "language_loss": 0.62663114, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.6534785, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 2.978440046310425 + }, + { + "auxiliary_loss_clip": 0.01435441, + "auxiliary_loss_mlp": 0.01243664, + "balance_loss_clip": 1.12178874, + "balance_loss_mlp": 1.02946901, + "epoch": 0.482368856155118, + "flos": 19355502060000.0, + "grad_norm": 2.5680464727869854, + "language_loss": 0.66629577, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.69308686, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 3.0043768882751465 + }, + { + "auxiliary_loss_clip": 0.01431119, + "auxiliary_loss_mlp": 0.0125474, + "balance_loss_clip": 1.11713433, + "balance_loss_mlp": 1.04207098, + "epoch": 0.48242897940778595, + "flos": 20080771621920.0, + "grad_norm": 2.172275963254186, + "language_loss": 0.76776546, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.79462409, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 2.9471540451049805 + }, + { + "auxiliary_loss_clip": 0.01432563, + "auxiliary_loss_mlp": 0.01253747, + "balance_loss_clip": 1.11879396, + "balance_loss_mlp": 1.04031432, + "epoch": 0.4824891026604539, + "flos": 23406232059360.0, + "grad_norm": 1.5049692531289351, + "language_loss": 0.75130224, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77816534, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 4.577250957489014 + }, + { + "auxiliary_loss_clip": 0.01426925, + "auxiliary_loss_mlp": 0.01240129, + "balance_loss_clip": 1.11442745, + "balance_loss_mlp": 1.0266968, + "epoch": 0.4825492259131219, + "flos": 20370356038080.0, + "grad_norm": 2.003612430573367, + "language_loss": 0.71002996, + "learning_rate": 2.209728283441112e-06, + "loss": 0.73670053, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.868849277496338 + }, + { + "auxiliary_loss_clip": 0.01437289, + "auxiliary_loss_mlp": 0.01246954, + "balance_loss_clip": 1.12300539, + "balance_loss_mlp": 1.02913439, + "epoch": 0.48260934916578985, + "flos": 14320754040960.0, + "grad_norm": 1.9429808051306352, + "language_loss": 0.75094539, + "learning_rate": 2.209340965060465e-06, + "loss": 0.7777878, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 2.975450277328491 + }, + { + "auxiliary_loss_clip": 0.01436977, + "auxiliary_loss_mlp": 0.01242601, + "balance_loss_clip": 1.12243652, + "balance_loss_mlp": 1.02459145, + "epoch": 0.4826694724184578, + "flos": 22122995870880.0, + "grad_norm": 4.030152369710449, + "language_loss": 0.67440772, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.70120358, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.9519076347351074 + }, + { + "auxiliary_loss_clip": 0.01435422, + "auxiliary_loss_mlp": 0.01241957, + "balance_loss_clip": 1.12082636, + "balance_loss_mlp": 1.02566373, + "epoch": 0.48272959567112583, + "flos": 16183083205440.0, + "grad_norm": 1.8511075737685014, + "language_loss": 0.73065817, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75743204, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.939260959625244 + }, + { + "auxiliary_loss_clip": 0.01430447, + "auxiliary_loss_mlp": 0.01250147, + "balance_loss_clip": 1.11566424, + "balance_loss_mlp": 1.03633261, + "epoch": 0.4827897189237938, + "flos": 23182415801280.0, + "grad_norm": 2.3809955540588557, + "language_loss": 0.84517753, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87198341, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.9543983936309814 + }, + { + "auxiliary_loss_clip": 0.01428594, + "auxiliary_loss_mlp": 0.01241583, + "balance_loss_clip": 1.1142745, + "balance_loss_mlp": 1.02891314, + "epoch": 0.48284984217646176, + "flos": 21654767999520.0, + "grad_norm": 1.8895933487810477, + "language_loss": 0.73600364, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76270539, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 2.9116950035095215 + }, + { + "auxiliary_loss_clip": 0.01436146, + "auxiliary_loss_mlp": 0.01260129, + "balance_loss_clip": 1.1192677, + "balance_loss_mlp": 1.04402602, + "epoch": 0.48290996542912973, + "flos": 31470142815360.0, + "grad_norm": 1.7030186590459913, + "language_loss": 0.71763086, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.74459362, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 2.945566415786743 + }, + { + "auxiliary_loss_clip": 0.0142873, + "auxiliary_loss_mlp": 0.01234106, + "balance_loss_clip": 1.11189842, + "balance_loss_mlp": 1.02010155, + "epoch": 0.4829700886817977, + "flos": 24464248647840.0, + "grad_norm": 1.635542851102972, + "language_loss": 0.74262595, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76925433, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.9288063049316406 + }, + { + "auxiliary_loss_clip": 0.01433469, + "auxiliary_loss_mlp": 0.01255735, + "balance_loss_clip": 1.11725545, + "balance_loss_mlp": 1.0419215, + "epoch": 0.48303021193446566, + "flos": 25704511866720.0, + "grad_norm": 2.262367097544231, + "language_loss": 0.83133805, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85823011, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 2.997187852859497 + }, + { + "auxiliary_loss_clip": 0.01433975, + "auxiliary_loss_mlp": 0.01235665, + "balance_loss_clip": 1.11770749, + "balance_loss_mlp": 1.02108812, + "epoch": 0.4830903351871336, + "flos": 20087371121760.0, + "grad_norm": 1.9451898713834943, + "language_loss": 0.79827374, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.82497019, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 2.9569931030273438 + }, + { + "auxiliary_loss_clip": 0.01431263, + "auxiliary_loss_mlp": 0.01249488, + "balance_loss_clip": 1.11630976, + "balance_loss_mlp": 1.03548336, + "epoch": 0.4831504584398016, + "flos": 39455048485440.0, + "grad_norm": 2.947012354611563, + "language_loss": 0.69529426, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.72210175, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 3.0340499877929688 + }, + { + "auxiliary_loss_clip": 0.01436573, + "auxiliary_loss_mlp": 0.01236715, + "balance_loss_clip": 1.12290597, + "balance_loss_mlp": 1.01965833, + "epoch": 0.48321058169246955, + "flos": 20008403964000.0, + "grad_norm": 1.8487725583166623, + "language_loss": 0.72752774, + "learning_rate": 2.205467347074847e-06, + "loss": 0.7542606, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 2.9961671829223633 + }, + { + "auxiliary_loss_clip": 0.01441685, + "auxiliary_loss_mlp": 0.01248467, + "balance_loss_clip": 1.12739587, + "balance_loss_mlp": 1.0281682, + "epoch": 0.4832707049451375, + "flos": 20743610703840.0, + "grad_norm": 2.2721759496329113, + "language_loss": 0.6948545, + "learning_rate": 2.205079942181525e-06, + "loss": 0.72175598, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 3.1325154304504395 + }, + { + "auxiliary_loss_clip": 0.01435928, + "auxiliary_loss_mlp": 0.01238966, + "balance_loss_clip": 1.12150979, + "balance_loss_mlp": 1.02400804, + "epoch": 0.4833308281978055, + "flos": 33148366869600.0, + "grad_norm": 1.5124655888294192, + "language_loss": 0.79144138, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81819034, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 3.0651137828826904 + }, + { + "auxiliary_loss_clip": 0.0143909, + "auxiliary_loss_mlp": 0.01237943, + "balance_loss_clip": 1.12474823, + "balance_loss_mlp": 1.02565539, + "epoch": 0.48339095145047345, + "flos": 19101267053280.0, + "grad_norm": 1.612941733755518, + "language_loss": 0.77648246, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.80325282, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 2.9302635192871094 + }, + { + "auxiliary_loss_clip": 0.01437643, + "auxiliary_loss_mlp": 0.01243146, + "balance_loss_clip": 1.12234044, + "balance_loss_mlp": 1.03028584, + "epoch": 0.4834510747031414, + "flos": 34462325232000.0, + "grad_norm": 1.6216053535434634, + "language_loss": 0.75536209, + "learning_rate": 2.203917680900409e-06, + "loss": 0.78217, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.9134585857391357 + }, + { + "auxiliary_loss_clip": 0.01436533, + "auxiliary_loss_mlp": 0.01242164, + "balance_loss_clip": 1.12306201, + "balance_loss_mlp": 1.02758718, + "epoch": 0.48351119795580944, + "flos": 27383304843360.0, + "grad_norm": 2.0169742624203804, + "language_loss": 0.6697793, + "learning_rate": 2.203530244988624e-06, + "loss": 0.69656634, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.8773610591888428 + }, + { + "auxiliary_loss_clip": 0.01453074, + "auxiliary_loss_mlp": 0.01202042, + "balance_loss_clip": 1.15907025, + "balance_loss_mlp": 1.00062561, + "epoch": 0.4835713212084774, + "flos": 67150315359360.0, + "grad_norm": 0.6910323407849255, + "language_loss": 0.5836941, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.61024523, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 3.4669389724731445 + }, + { + "auxiliary_loss_clip": 0.01436006, + "auxiliary_loss_mlp": 0.01239669, + "balance_loss_clip": 1.12101841, + "balance_loss_mlp": 1.0228039, + "epoch": 0.48363144446114537, + "flos": 17969403608640.0, + "grad_norm": 2.1617729072254726, + "language_loss": 0.72035992, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74711668, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.8552417755126953 + }, + { + "auxiliary_loss_clip": 0.0144094, + "auxiliary_loss_mlp": 0.0123708, + "balance_loss_clip": 1.12731433, + "balance_loss_mlp": 1.020787, + "epoch": 0.48369156771381333, + "flos": 20595537709920.0, + "grad_norm": 1.378353844956512, + "language_loss": 0.76034725, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78712744, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.8520896434783936 + }, + { + "auxiliary_loss_clip": 0.01439452, + "auxiliary_loss_mlp": 0.01242164, + "balance_loss_clip": 1.12521231, + "balance_loss_mlp": 1.02758718, + "epoch": 0.4837516909664813, + "flos": 22677321394080.0, + "grad_norm": 1.7149335962973107, + "language_loss": 0.690714, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71753013, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.8477249145507812 + }, + { + "auxiliary_loss_clip": 0.01437256, + "auxiliary_loss_mlp": 0.0123115, + "balance_loss_clip": 1.12210691, + "balance_loss_mlp": 1.01638222, + "epoch": 0.48381181421914926, + "flos": 25520899822560.0, + "grad_norm": 2.4857327161727247, + "language_loss": 0.82340944, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.85009348, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.849274158477783 + }, + { + "auxiliary_loss_clip": 0.01438492, + "auxiliary_loss_mlp": 0.01250048, + "balance_loss_clip": 1.12394142, + "balance_loss_mlp": 1.03528082, + "epoch": 0.4838719374718172, + "flos": 24209824000320.0, + "grad_norm": 1.6637514211665625, + "language_loss": 0.80703557, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.83392096, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.851665735244751 + }, + { + "auxiliary_loss_clip": 0.01440133, + "auxiliary_loss_mlp": 0.01243961, + "balance_loss_clip": 1.12490046, + "balance_loss_mlp": 1.02957487, + "epoch": 0.4839320607244852, + "flos": 26727027333120.0, + "grad_norm": 1.8924873216277343, + "language_loss": 0.81314421, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83998519, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.877901315689087 + }, + { + "auxiliary_loss_clip": 0.01443942, + "auxiliary_loss_mlp": 0.01243585, + "balance_loss_clip": 1.13001657, + "balance_loss_mlp": 1.03148746, + "epoch": 0.48399218397715316, + "flos": 20450992034880.0, + "grad_norm": 2.143403556207344, + "language_loss": 0.72927725, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.75615251, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 5.183178424835205 + }, + { + "auxiliary_loss_clip": 0.01454009, + "auxiliary_loss_mlp": 0.01204239, + "balance_loss_clip": 1.16071916, + "balance_loss_mlp": 1.00434875, + "epoch": 0.4840523072298211, + "flos": 67186954326240.0, + "grad_norm": 0.69528945371689, + "language_loss": 0.56277204, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58935452, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 3.402425765991211 + }, + { + "auxiliary_loss_clip": 0.01440283, + "auxiliary_loss_mlp": 0.01249843, + "balance_loss_clip": 1.12594962, + "balance_loss_mlp": 1.03297782, + "epoch": 0.4841124304824891, + "flos": 22413262993920.0, + "grad_norm": 4.410953222830091, + "language_loss": 0.75451165, + "learning_rate": 2.199655463811236e-06, + "loss": 0.78141296, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.78054141998291 + }, + { + "auxiliary_loss_clip": 0.01437839, + "auxiliary_loss_mlp": 0.01243311, + "balance_loss_clip": 1.12502122, + "balance_loss_mlp": 1.028162, + "epoch": 0.48417255373515705, + "flos": 13845623244480.0, + "grad_norm": 2.4373424747721764, + "language_loss": 0.66571355, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.69252509, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.785947799682617 + }, + { + "auxiliary_loss_clip": 0.0143809, + "auxiliary_loss_mlp": 0.01245462, + "balance_loss_clip": 1.12465847, + "balance_loss_mlp": 1.03183937, + "epoch": 0.484232676987825, + "flos": 31652389445760.0, + "grad_norm": 2.6520662050881536, + "language_loss": 0.6891523, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71598774, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 2.879377841949463 + }, + { + "auxiliary_loss_clip": 0.01436749, + "auxiliary_loss_mlp": 0.01238576, + "balance_loss_clip": 1.12290144, + "balance_loss_mlp": 1.02571559, + "epoch": 0.48429280024049304, + "flos": 24097555553760.0, + "grad_norm": 1.8240641619736069, + "language_loss": 0.69777513, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.72452843, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 2.9243531227111816 + }, + { + "auxiliary_loss_clip": 0.01447958, + "auxiliary_loss_mlp": 0.01255943, + "balance_loss_clip": 1.13401079, + "balance_loss_mlp": 1.04174805, + "epoch": 0.484352923493161, + "flos": 17532011695680.0, + "grad_norm": 4.759992899440454, + "language_loss": 0.63434178, + "learning_rate": 2.198105338530685e-06, + "loss": 0.66138077, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 3.0061182975769043 + }, + { + "auxiliary_loss_clip": 0.01441032, + "auxiliary_loss_mlp": 0.01243452, + "balance_loss_clip": 1.12737226, + "balance_loss_mlp": 1.03040087, + "epoch": 0.48441304674582897, + "flos": 29169321821280.0, + "grad_norm": 2.2770084474617875, + "language_loss": 0.67393386, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.70077866, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 3.135573387145996 + }, + { + "auxiliary_loss_clip": 0.01440576, + "auxiliary_loss_mlp": 0.01241196, + "balance_loss_clip": 1.12629461, + "balance_loss_mlp": 1.02909851, + "epoch": 0.48447316999849693, + "flos": 15888454344000.0, + "grad_norm": 1.6508098002739942, + "language_loss": 0.81716609, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.84398377, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 4.506786584854126 + }, + { + "auxiliary_loss_clip": 0.01445523, + "auxiliary_loss_mlp": 0.01241671, + "balance_loss_clip": 1.13197303, + "balance_loss_mlp": 1.02194452, + "epoch": 0.4845332932511649, + "flos": 24383271297600.0, + "grad_norm": 1.6461634612178915, + "language_loss": 0.79503465, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82190663, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.996187686920166 + }, + { + "auxiliary_loss_clip": 0.01446016, + "auxiliary_loss_mlp": 0.01252216, + "balance_loss_clip": 1.1331594, + "balance_loss_mlp": 1.03649473, + "epoch": 0.48459341650383286, + "flos": 37119143579040.0, + "grad_norm": 2.2353748984864503, + "language_loss": 0.67149949, + "learning_rate": 2.196555093055352e-06, + "loss": 0.6984818, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 3.11019229888916 + }, + { + "auxiliary_loss_clip": 0.01450359, + "auxiliary_loss_mlp": 0.0125199, + "balance_loss_clip": 1.13710809, + "balance_loss_mlp": 1.03531504, + "epoch": 0.48465353975650083, + "flos": 22969029787200.0, + "grad_norm": 2.392657567023864, + "language_loss": 0.67156792, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69859135, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 2.9853098392486572 + }, + { + "auxiliary_loss_clip": 0.01453546, + "auxiliary_loss_mlp": 0.01255139, + "balance_loss_clip": 1.13959336, + "balance_loss_mlp": 1.0403713, + "epoch": 0.4847136630091688, + "flos": 17709365593440.0, + "grad_norm": 2.049307561839567, + "language_loss": 0.82198751, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84907436, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.9128506183624268 + }, + { + "auxiliary_loss_clip": 0.01446021, + "auxiliary_loss_mlp": 0.01237333, + "balance_loss_clip": 1.13233411, + "balance_loss_mlp": 1.02332878, + "epoch": 0.48477378626183676, + "flos": 22020474961440.0, + "grad_norm": 2.6207793897137104, + "language_loss": 0.74462831, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.77146184, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 2.9869678020477295 + }, + { + "auxiliary_loss_clip": 0.01447594, + "auxiliary_loss_mlp": 0.01250678, + "balance_loss_clip": 1.13522434, + "balance_loss_mlp": 1.03495717, + "epoch": 0.4848339095145047, + "flos": 27965356215840.0, + "grad_norm": 2.0243189463476066, + "language_loss": 0.78916562, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.8161484, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 4.389641284942627 + }, + { + "auxiliary_loss_clip": 0.01448238, + "auxiliary_loss_mlp": 0.01240411, + "balance_loss_clip": 1.13603616, + "balance_loss_mlp": 1.02716982, + "epoch": 0.4848940327671727, + "flos": 21690875972160.0, + "grad_norm": 2.628478529858658, + "language_loss": 0.78607285, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81295931, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.8842649459838867 + }, + { + "auxiliary_loss_clip": 0.01441665, + "auxiliary_loss_mlp": 0.0123299, + "balance_loss_clip": 1.12776411, + "balance_loss_mlp": 1.02032018, + "epoch": 0.48495415601984065, + "flos": 20633693803200.0, + "grad_norm": 1.8045840640618582, + "language_loss": 0.75984418, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78659081, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 2.891489028930664 + }, + { + "auxiliary_loss_clip": 0.01444802, + "auxiliary_loss_mlp": 0.01236752, + "balance_loss_clip": 1.13290131, + "balance_loss_mlp": 1.02293849, + "epoch": 0.4850142792725086, + "flos": 25630437441600.0, + "grad_norm": 1.5230185766415814, + "language_loss": 0.71917754, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74599308, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.8798913955688477 + }, + { + "auxiliary_loss_clip": 0.01441963, + "auxiliary_loss_mlp": 0.01243555, + "balance_loss_clip": 1.12888992, + "balance_loss_mlp": 1.02821577, + "epoch": 0.4850744025251766, + "flos": 13773672796320.0, + "grad_norm": 3.941374056286987, + "language_loss": 0.7866798, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81353498, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.8712213039398193 + }, + { + "auxiliary_loss_clip": 0.0143768, + "auxiliary_loss_mlp": 0.01232302, + "balance_loss_clip": 1.12493324, + "balance_loss_mlp": 1.0217303, + "epoch": 0.4851345257778446, + "flos": 20263132036800.0, + "grad_norm": 1.588997643924757, + "language_loss": 0.84700191, + "learning_rate": 2.193066606145638e-06, + "loss": 0.87370169, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 2.816417932510376 + }, + { + "auxiliary_loss_clip": 0.01439691, + "auxiliary_loss_mlp": 0.01237283, + "balance_loss_clip": 1.12736988, + "balance_loss_mlp": 1.02289665, + "epoch": 0.48519464903051257, + "flos": 27092165372640.0, + "grad_norm": 1.6773843847858683, + "language_loss": 0.77871203, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80548179, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.8803577423095703 + }, + { + "auxiliary_loss_clip": 0.01436184, + "auxiliary_loss_mlp": 0.01249129, + "balance_loss_clip": 1.12319732, + "balance_loss_mlp": 1.03340769, + "epoch": 0.48525477228318054, + "flos": 17129058916320.0, + "grad_norm": 1.8934921513568777, + "language_loss": 0.77608764, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80294085, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 2.929567337036133 + }, + { + "auxiliary_loss_clip": 0.01432986, + "auxiliary_loss_mlp": 0.01244228, + "balance_loss_clip": 1.12090886, + "balance_loss_mlp": 1.03003299, + "epoch": 0.4853148955358485, + "flos": 28182648830400.0, + "grad_norm": 2.2838559544437893, + "language_loss": 0.71899068, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74576283, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.904823064804077 + }, + { + "auxiliary_loss_clip": 0.01440618, + "auxiliary_loss_mlp": 0.01248956, + "balance_loss_clip": 1.1278863, + "balance_loss_mlp": 1.03361654, + "epoch": 0.48537501878851647, + "flos": 17495221016160.0, + "grad_norm": 2.1673074034379107, + "language_loss": 0.88037026, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.9072659, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.9065375328063965 + }, + { + "auxiliary_loss_clip": 0.01431725, + "auxiliary_loss_mlp": 0.01237145, + "balance_loss_clip": 1.1204499, + "balance_loss_mlp": 1.02447581, + "epoch": 0.48543514204118443, + "flos": 28587384233280.0, + "grad_norm": 2.05352291157772, + "language_loss": 0.60895061, + "learning_rate": 2.19112830093786e-06, + "loss": 0.63563931, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 2.9766252040863037 + }, + { + "auxiliary_loss_clip": 0.01431115, + "auxiliary_loss_mlp": 0.0123791, + "balance_loss_clip": 1.11887813, + "balance_loss_mlp": 1.02180791, + "epoch": 0.4854952652938524, + "flos": 20962117019520.0, + "grad_norm": 1.7269897677598929, + "language_loss": 0.73434031, + "learning_rate": 2.19074061809469e-06, + "loss": 0.76103055, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.8642241954803467 + }, + { + "auxiliary_loss_clip": 0.01440418, + "auxiliary_loss_mlp": 0.01242279, + "balance_loss_clip": 1.12887979, + "balance_loss_mlp": 1.03037226, + "epoch": 0.48555538854652036, + "flos": 66534166571040.0, + "grad_norm": 1.5116496694046426, + "language_loss": 0.81628835, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84311533, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 3.3035237789154053 + }, + { + "auxiliary_loss_clip": 0.01435429, + "auxiliary_loss_mlp": 0.0124878, + "balance_loss_clip": 1.12319469, + "balance_loss_mlp": 1.03153288, + "epoch": 0.4856155117991883, + "flos": 15926838006240.0, + "grad_norm": 1.8802765872083067, + "language_loss": 0.86310464, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88994676, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 2.8176753520965576 + }, + { + "auxiliary_loss_clip": 0.01411088, + "auxiliary_loss_mlp": 0.01282562, + "balance_loss_clip": 1.12651122, + "balance_loss_mlp": 1.084198, + "epoch": 0.4856756350518563, + "flos": 71053958496960.0, + "grad_norm": 0.9118908567198245, + "language_loss": 0.5840019, + "learning_rate": 2.189577526226564e-06, + "loss": 0.61093843, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.3307535648345947 + }, + { + "auxiliary_loss_clip": 0.01432848, + "auxiliary_loss_mlp": 0.01247745, + "balance_loss_clip": 1.12023664, + "balance_loss_mlp": 1.03202367, + "epoch": 0.48573575830452426, + "flos": 29828292230880.0, + "grad_norm": 1.5999930842059173, + "language_loss": 0.72230393, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74910992, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.958030939102173 + }, + { + "auxiliary_loss_clip": 0.01430816, + "auxiliary_loss_mlp": 0.01241369, + "balance_loss_clip": 1.11826968, + "balance_loss_mlp": 1.02641106, + "epoch": 0.4857958815571922, + "flos": 17641435530240.0, + "grad_norm": 2.5354052211196327, + "language_loss": 0.79476309, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.82148492, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 2.760127544403076 + }, + { + "auxiliary_loss_clip": 0.01431312, + "auxiliary_loss_mlp": 0.0123985, + "balance_loss_clip": 1.12043512, + "balance_loss_mlp": 1.02527356, + "epoch": 0.4858560048098602, + "flos": 21107610898560.0, + "grad_norm": 2.5932751383440484, + "language_loss": 0.83918595, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86589754, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.808026075363159 + }, + { + "auxiliary_loss_clip": 0.01428754, + "auxiliary_loss_mlp": 0.01238499, + "balance_loss_clip": 1.11757088, + "balance_loss_mlp": 1.02430415, + "epoch": 0.4859161280625282, + "flos": 22093032260160.0, + "grad_norm": 1.7675839092490655, + "language_loss": 0.83043355, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85710609, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.8457579612731934 + }, + { + "auxiliary_loss_clip": 0.01436887, + "auxiliary_loss_mlp": 0.01238456, + "balance_loss_clip": 1.12519121, + "balance_loss_mlp": 1.02273524, + "epoch": 0.4859762513151962, + "flos": 17495372728800.0, + "grad_norm": 2.1517409633033413, + "language_loss": 0.87243605, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89918947, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 2.8606369495391846 + }, + { + "auxiliary_loss_clip": 0.01428874, + "auxiliary_loss_mlp": 0.0123773, + "balance_loss_clip": 1.11796224, + "balance_loss_mlp": 1.0250603, + "epoch": 0.48603637456786414, + "flos": 18006194288160.0, + "grad_norm": 1.8386324631858124, + "language_loss": 0.80643791, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.8331039, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.77095103263855 + }, + { + "auxiliary_loss_clip": 0.0143037, + "auxiliary_loss_mlp": 0.01242722, + "balance_loss_clip": 1.11898208, + "balance_loss_mlp": 1.02661896, + "epoch": 0.4860964978205321, + "flos": 22494240344160.0, + "grad_norm": 1.8530267628684487, + "language_loss": 0.68366367, + "learning_rate": 2.186863394279098e-06, + "loss": 0.71039456, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.8310928344726562 + }, + { + "auxiliary_loss_clip": 0.01432329, + "auxiliary_loss_mlp": 0.01241224, + "balance_loss_clip": 1.12054777, + "balance_loss_mlp": 1.02683759, + "epoch": 0.48615662107320007, + "flos": 23375054747520.0, + "grad_norm": 1.4312891319501428, + "language_loss": 0.77386898, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.80060446, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.8967463970184326 + }, + { + "auxiliary_loss_clip": 0.01432377, + "auxiliary_loss_mlp": 0.01235223, + "balance_loss_clip": 1.12030625, + "balance_loss_mlp": 1.01988339, + "epoch": 0.48621674432586803, + "flos": 34421476239360.0, + "grad_norm": 3.350584281852091, + "language_loss": 0.70204902, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.72872496, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.8956456184387207 + }, + { + "auxiliary_loss_clip": 0.01432367, + "auxiliary_loss_mlp": 0.01251518, + "balance_loss_clip": 1.12016594, + "balance_loss_mlp": 1.03541517, + "epoch": 0.486276867578536, + "flos": 33110248704480.0, + "grad_norm": 2.033486832478368, + "language_loss": 0.73124182, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.7580806, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 4.562969207763672 + }, + { + "auxiliary_loss_clip": 0.01429961, + "auxiliary_loss_mlp": 0.01243582, + "balance_loss_clip": 1.11841774, + "balance_loss_mlp": 1.0314846, + "epoch": 0.48633699083120396, + "flos": 21472862722560.0, + "grad_norm": 2.168043393940634, + "language_loss": 0.75282371, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77955914, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.843839168548584 + }, + { + "auxiliary_loss_clip": 0.01428283, + "auxiliary_loss_mlp": 0.0123707, + "balance_loss_clip": 1.11790085, + "balance_loss_mlp": 1.02211189, + "epoch": 0.48639711408387193, + "flos": 20086422917760.0, + "grad_norm": 1.7018022807295587, + "language_loss": 0.84419966, + "learning_rate": 2.184924515731926e-06, + "loss": 0.87085325, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 2.836690902709961 + }, + { + "auxiliary_loss_clip": 0.01431091, + "auxiliary_loss_mlp": 0.01239249, + "balance_loss_clip": 1.11925137, + "balance_loss_mlp": 1.02429128, + "epoch": 0.4864572373365399, + "flos": 20781084090240.0, + "grad_norm": 1.621997921579332, + "language_loss": 0.75846833, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78517175, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.8310036659240723 + }, + { + "auxiliary_loss_clip": 0.01428816, + "auxiliary_loss_mlp": 0.01241813, + "balance_loss_clip": 1.11720979, + "balance_loss_mlp": 1.02838099, + "epoch": 0.48651736058920786, + "flos": 26027435499840.0, + "grad_norm": 1.5501536632248598, + "language_loss": 0.80455959, + "learning_rate": 2.184148915123631e-06, + "loss": 0.83126593, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.801637649536133 + }, + { + "auxiliary_loss_clip": 0.01428758, + "auxiliary_loss_mlp": 0.01238945, + "balance_loss_clip": 1.11763859, + "balance_loss_mlp": 1.02532244, + "epoch": 0.4865774838418758, + "flos": 20487706858080.0, + "grad_norm": 1.5054872370988723, + "language_loss": 0.71715325, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.74383026, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 2.8618907928466797 + }, + { + "auxiliary_loss_clip": 0.01425587, + "auxiliary_loss_mlp": 0.0124164, + "balance_loss_clip": 1.11449397, + "balance_loss_mlp": 1.0291611, + "epoch": 0.4866376070945438, + "flos": 23549829530400.0, + "grad_norm": 1.682756382285089, + "language_loss": 0.68124533, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.70791763, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.8266053199768066 + }, + { + "auxiliary_loss_clip": 0.01445721, + "auxiliary_loss_mlp": 0.01250631, + "balance_loss_clip": 1.13338864, + "balance_loss_mlp": 1.03452873, + "epoch": 0.4866977303472118, + "flos": 16692463494720.0, + "grad_norm": 2.7693505807650354, + "language_loss": 0.67015797, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.6971215, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 4.204641580581665 + }, + { + "auxiliary_loss_clip": 0.01434104, + "auxiliary_loss_mlp": 0.01259378, + "balance_loss_clip": 1.12195086, + "balance_loss_mlp": 1.04670858, + "epoch": 0.4867578535998798, + "flos": 17898022082880.0, + "grad_norm": 2.10282214928931, + "language_loss": 0.78808856, + "learning_rate": 2.182597630229345e-06, + "loss": 0.8150233, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 2.8683993816375732 + }, + { + "auxiliary_loss_clip": 0.01425152, + "auxiliary_loss_mlp": 0.01246029, + "balance_loss_clip": 1.11455798, + "balance_loss_mlp": 1.03602958, + "epoch": 0.48681797685254774, + "flos": 22639999720320.0, + "grad_norm": 2.424297366282243, + "language_loss": 0.68030834, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.70702016, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 4.166038513183594 + }, + { + "auxiliary_loss_clip": 0.01426322, + "auxiliary_loss_mlp": 0.01246067, + "balance_loss_clip": 1.11499143, + "balance_loss_mlp": 1.03339767, + "epoch": 0.4868781001052157, + "flos": 20888156378880.0, + "grad_norm": 1.6393687870932996, + "language_loss": 0.71560681, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.74233067, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.734358310699463 + }, + { + "auxiliary_loss_clip": 0.01437936, + "auxiliary_loss_mlp": 0.01269549, + "balance_loss_clip": 1.12545514, + "balance_loss_mlp": 1.05630755, + "epoch": 0.48693822335788367, + "flos": 41978206539360.0, + "grad_norm": 2.077602204174622, + "language_loss": 0.66508651, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.69216144, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 2.9439120292663574 + }, + { + "auxiliary_loss_clip": 0.01429702, + "auxiliary_loss_mlp": 0.01248839, + "balance_loss_clip": 1.11828971, + "balance_loss_mlp": 1.03502536, + "epoch": 0.48699834661055164, + "flos": 24245590619520.0, + "grad_norm": 1.7310087841736383, + "language_loss": 0.66837406, + "learning_rate": 2.181046234549138e-06, + "loss": 0.69515949, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 3.042076587677002 + }, + { + "auxiliary_loss_clip": 0.01429944, + "auxiliary_loss_mlp": 0.01239971, + "balance_loss_clip": 1.11876941, + "balance_loss_mlp": 1.02920914, + "epoch": 0.4870584698632196, + "flos": 25926393788640.0, + "grad_norm": 1.6212213349337936, + "language_loss": 0.76632106, + "learning_rate": 2.180658368429088e-06, + "loss": 0.79302019, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 2.8376739025115967 + }, + { + "auxiliary_loss_clip": 0.0144255, + "auxiliary_loss_mlp": 0.01192337, + "balance_loss_clip": 1.16204226, + "balance_loss_mlp": 0.98939514, + "epoch": 0.48711859311588757, + "flos": 70218658249920.0, + "grad_norm": 0.6957084846708891, + "language_loss": 0.5228011, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54914999, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 4.815408706665039 + }, + { + "auxiliary_loss_clip": 0.01432465, + "auxiliary_loss_mlp": 0.01246194, + "balance_loss_clip": 1.11952448, + "balance_loss_mlp": 1.03428769, + "epoch": 0.48717871636855553, + "flos": 12344260021920.0, + "grad_norm": 2.02185200145914, + "language_loss": 0.73760056, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.76438719, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.7422702312469482 + }, + { + "auxiliary_loss_clip": 0.01431163, + "auxiliary_loss_mlp": 0.01246601, + "balance_loss_clip": 1.1193552, + "balance_loss_mlp": 1.03545725, + "epoch": 0.4872388396212235, + "flos": 23479396208640.0, + "grad_norm": 1.9131730236486615, + "language_loss": 0.63058114, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65735877, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 2.8032729625701904 + }, + { + "auxiliary_loss_clip": 0.01435051, + "auxiliary_loss_mlp": 0.01241688, + "balance_loss_clip": 1.12272501, + "balance_loss_mlp": 1.02901888, + "epoch": 0.48729896287389146, + "flos": 31430090314080.0, + "grad_norm": 1.945353093359264, + "language_loss": 0.68850416, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71527159, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 2.8921613693237305 + }, + { + "auxiliary_loss_clip": 0.01427908, + "auxiliary_loss_mlp": 0.01230433, + "balance_loss_clip": 1.11527514, + "balance_loss_mlp": 1.02005267, + "epoch": 0.4873590861265594, + "flos": 19059697425600.0, + "grad_norm": 1.6892412829013448, + "language_loss": 0.73531127, + "learning_rate": 2.178718935364259e-06, + "loss": 0.7618947, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.743138313293457 + }, + { + "auxiliary_loss_clip": 0.01441754, + "auxiliary_loss_mlp": 0.01254486, + "balance_loss_clip": 1.1296984, + "balance_loss_mlp": 1.03952742, + "epoch": 0.4874192093792274, + "flos": 24350349290400.0, + "grad_norm": 1.97650648210541, + "language_loss": 0.76710343, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.79406589, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 2.8446969985961914 + }, + { + "auxiliary_loss_clip": 0.01431106, + "auxiliary_loss_mlp": 0.01243491, + "balance_loss_clip": 1.11972344, + "balance_loss_mlp": 1.03463674, + "epoch": 0.4874793326318954, + "flos": 23114865019680.0, + "grad_norm": 1.6787889024266698, + "language_loss": 0.75451851, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.78126454, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.856928586959839 + }, + { + "auxiliary_loss_clip": 0.01433728, + "auxiliary_loss_mlp": 0.0123682, + "balance_loss_clip": 1.12040174, + "balance_loss_mlp": 1.02663016, + "epoch": 0.4875394558845634, + "flos": 19028178760320.0, + "grad_norm": 1.7491740794464765, + "language_loss": 0.74021971, + "learning_rate": 2.177555194083212e-06, + "loss": 0.76692516, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 2.7509939670562744 + }, + { + "auxiliary_loss_clip": 0.01442646, + "auxiliary_loss_mlp": 0.01246517, + "balance_loss_clip": 1.12806511, + "balance_loss_mlp": 1.03518236, + "epoch": 0.48759957913723134, + "flos": 21435730689600.0, + "grad_norm": 2.363697452231887, + "language_loss": 0.78575695, + "learning_rate": 2.177167266837428e-06, + "loss": 0.81264859, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 2.8188223838806152 + }, + { + "auxiliary_loss_clip": 0.01440664, + "auxiliary_loss_mlp": 0.01246314, + "balance_loss_clip": 1.12624252, + "balance_loss_mlp": 1.03269124, + "epoch": 0.4876597023898993, + "flos": 17750593867680.0, + "grad_norm": 1.884903763336032, + "language_loss": 0.72335929, + "learning_rate": 2.176779332873444e-06, + "loss": 0.750229, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 2.727429151535034 + }, + { + "auxiliary_loss_clip": 0.0143112, + "auxiliary_loss_mlp": 0.01252744, + "balance_loss_clip": 1.11932862, + "balance_loss_mlp": 1.04141009, + "epoch": 0.4877198256425673, + "flos": 17021569417920.0, + "grad_norm": 1.6017677516805604, + "language_loss": 0.76063997, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78747863, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 2.792288064956665 + }, + { + "auxiliary_loss_clip": 0.01426178, + "auxiliary_loss_mlp": 0.01244321, + "balance_loss_clip": 1.11377406, + "balance_loss_mlp": 1.03031695, + "epoch": 0.48777994889523524, + "flos": 22386371564160.0, + "grad_norm": 2.201063619083876, + "language_loss": 0.75350839, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.78021336, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.7862744331359863 + }, + { + "auxiliary_loss_clip": 0.01429466, + "auxiliary_loss_mlp": 0.01209419, + "balance_loss_clip": 1.14844966, + "balance_loss_mlp": 1.00952911, + "epoch": 0.4878400721479032, + "flos": 61248293290080.0, + "grad_norm": 0.7914361380741053, + "language_loss": 0.48844904, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.51483792, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 3.2369773387908936 + }, + { + "auxiliary_loss_clip": 0.01431829, + "auxiliary_loss_mlp": 0.01245887, + "balance_loss_clip": 1.12014198, + "balance_loss_mlp": 1.03684163, + "epoch": 0.48790019540057117, + "flos": 24539005779840.0, + "grad_norm": 2.2061240846422687, + "language_loss": 0.76604372, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.79282093, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 2.7814018726348877 + }, + { + "auxiliary_loss_clip": 0.0143599, + "auxiliary_loss_mlp": 0.01259487, + "balance_loss_clip": 1.12341583, + "balance_loss_mlp": 1.04681778, + "epoch": 0.48796031865323913, + "flos": 21836028497760.0, + "grad_norm": 2.1525718741030517, + "language_loss": 0.722875, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74982977, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.8509936332702637 + }, + { + "auxiliary_loss_clip": 0.01429445, + "auxiliary_loss_mlp": 0.01243469, + "balance_loss_clip": 1.11844289, + "balance_loss_mlp": 1.03156245, + "epoch": 0.4880204419059071, + "flos": 18590938560000.0, + "grad_norm": 2.120231877518941, + "language_loss": 0.63603997, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.66276908, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.7933061122894287 + }, + { + "auxiliary_loss_clip": 0.01427242, + "auxiliary_loss_mlp": 0.0124894, + "balance_loss_clip": 1.11455894, + "balance_loss_mlp": 1.0393219, + "epoch": 0.48808056515857506, + "flos": 19174203633600.0, + "grad_norm": 2.052023923774165, + "language_loss": 0.7935403, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.82030219, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 2.7709577083587646 + }, + { + "auxiliary_loss_clip": 0.01435345, + "auxiliary_loss_mlp": 0.01256733, + "balance_loss_clip": 1.12367916, + "balance_loss_mlp": 1.0444454, + "epoch": 0.48814068841124303, + "flos": 20122568818560.0, + "grad_norm": 1.7662359641184557, + "language_loss": 0.63246304, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65938383, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 2.7911617755889893 + }, + { + "auxiliary_loss_clip": 0.01435237, + "auxiliary_loss_mlp": 0.01241895, + "balance_loss_clip": 1.12245381, + "balance_loss_mlp": 1.02750897, + "epoch": 0.488200811663911, + "flos": 22967778157920.0, + "grad_norm": 1.8538405434520715, + "language_loss": 0.72298348, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74975473, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.7459819316864014 + }, + { + "auxiliary_loss_clip": 0.01433773, + "auxiliary_loss_mlp": 0.01244294, + "balance_loss_clip": 1.12301159, + "balance_loss_mlp": 1.0323875, + "epoch": 0.48826093491657896, + "flos": 33913461363840.0, + "grad_norm": 2.179043595239803, + "language_loss": 0.64366305, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.67044377, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 2.918403387069702 + }, + { + "auxiliary_loss_clip": 0.01432307, + "auxiliary_loss_mlp": 0.01241275, + "balance_loss_clip": 1.11989117, + "balance_loss_mlp": 1.02822363, + "epoch": 0.488321058169247, + "flos": 23072233403520.0, + "grad_norm": 2.5757146282711414, + "language_loss": 0.82257456, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.8493104, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 2.8132338523864746 + }, + { + "auxiliary_loss_clip": 0.01435613, + "auxiliary_loss_mlp": 0.01249056, + "balance_loss_clip": 1.12260699, + "balance_loss_mlp": 1.03753066, + "epoch": 0.48838118142191494, + "flos": 19319735440800.0, + "grad_norm": 1.8014307538334435, + "language_loss": 0.8524909, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87933761, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 2.8071274757385254 + }, + { + "auxiliary_loss_clip": 0.01431982, + "auxiliary_loss_mlp": 0.01248841, + "balance_loss_clip": 1.11931205, + "balance_loss_mlp": 1.03655291, + "epoch": 0.4884413046745829, + "flos": 25413031042560.0, + "grad_norm": 1.4546121985147598, + "language_loss": 0.85379374, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.880602, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.853503465652466 + }, + { + "auxiliary_loss_clip": 0.01429919, + "auxiliary_loss_mlp": 0.01248061, + "balance_loss_clip": 1.1183238, + "balance_loss_mlp": 1.03710842, + "epoch": 0.4885014279272509, + "flos": 20993408115840.0, + "grad_norm": 1.9591312076128158, + "language_loss": 0.79095888, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81773865, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.7490146160125732 + }, + { + "auxiliary_loss_clip": 0.01433958, + "auxiliary_loss_mlp": 0.01249612, + "balance_loss_clip": 1.12267685, + "balance_loss_mlp": 1.03579831, + "epoch": 0.48856155117991884, + "flos": 13773293514720.0, + "grad_norm": 2.0040248292474936, + "language_loss": 0.72793424, + "learning_rate": 2.170959527233356e-06, + "loss": 0.75476992, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 2.8205652236938477 + }, + { + "auxiliary_loss_clip": 0.01431135, + "auxiliary_loss_mlp": 0.01249511, + "balance_loss_clip": 1.1191417, + "balance_loss_mlp": 1.03779495, + "epoch": 0.4886216744325868, + "flos": 32090501993760.0, + "grad_norm": 1.7364915742586606, + "language_loss": 0.68793172, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.71473813, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 4.352795362472534 + }, + { + "auxiliary_loss_clip": 0.01430783, + "auxiliary_loss_mlp": 0.0125481, + "balance_loss_clip": 1.11947775, + "balance_loss_mlp": 1.04099607, + "epoch": 0.48868179768525477, + "flos": 19612126540800.0, + "grad_norm": 1.7243386433427945, + "language_loss": 0.76180822, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78866416, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.8311476707458496 + }, + { + "auxiliary_loss_clip": 0.01428829, + "auxiliary_loss_mlp": 0.01249974, + "balance_loss_clip": 1.11768484, + "balance_loss_mlp": 1.038831, + "epoch": 0.48874192093792274, + "flos": 21288985181280.0, + "grad_norm": 2.717835780908157, + "language_loss": 0.75988412, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78667223, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 2.7725913524627686 + }, + { + "auxiliary_loss_clip": 0.01432706, + "auxiliary_loss_mlp": 0.01245489, + "balance_loss_clip": 1.12020612, + "balance_loss_mlp": 1.03243828, + "epoch": 0.4888020441905907, + "flos": 14175184305600.0, + "grad_norm": 2.2693940871953027, + "language_loss": 0.64954376, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67632574, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 2.7096872329711914 + }, + { + "auxiliary_loss_clip": 0.01426457, + "auxiliary_loss_mlp": 0.01233268, + "balance_loss_clip": 1.11449623, + "balance_loss_mlp": 1.02193379, + "epoch": 0.48886216744325867, + "flos": 24100210524960.0, + "grad_norm": 2.0748116872592726, + "language_loss": 0.7240203, + "learning_rate": 2.169019265427658e-06, + "loss": 0.75061756, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 2.8373405933380127 + }, + { + "auxiliary_loss_clip": 0.01432718, + "auxiliary_loss_mlp": 0.01246306, + "balance_loss_clip": 1.12009072, + "balance_loss_mlp": 1.03497171, + "epoch": 0.48892229069592663, + "flos": 38434239786240.0, + "grad_norm": 1.4355729252775749, + "language_loss": 0.69610906, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.72289932, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 2.9072518348693848 + }, + { + "auxiliary_loss_clip": 0.01431905, + "auxiliary_loss_mlp": 0.01240707, + "balance_loss_clip": 1.120507, + "balance_loss_mlp": 1.02689362, + "epoch": 0.4889824139485946, + "flos": 23845672092960.0, + "grad_norm": 1.3970460756302754, + "language_loss": 0.70461857, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.73134464, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 2.8285815715789795 + }, + { + "auxiliary_loss_clip": 0.01430782, + "auxiliary_loss_mlp": 0.0123407, + "balance_loss_clip": 1.1187768, + "balance_loss_mlp": 1.02330828, + "epoch": 0.48904253720126256, + "flos": 24428140675200.0, + "grad_norm": 1.8338892327920027, + "language_loss": 0.70638824, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.73303682, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 4.2376298904418945 + }, + { + "auxiliary_loss_clip": 0.01432864, + "auxiliary_loss_mlp": 0.01238496, + "balance_loss_clip": 1.1210649, + "balance_loss_mlp": 1.02372861, + "epoch": 0.4891026604539306, + "flos": 24172995392640.0, + "grad_norm": 2.0571882633769647, + "language_loss": 0.80695176, + "learning_rate": 2.167466940528718e-06, + "loss": 0.83366537, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 4.256767511367798 + }, + { + "auxiliary_loss_clip": 0.01429209, + "auxiliary_loss_mlp": 0.01241765, + "balance_loss_clip": 1.11669397, + "balance_loss_mlp": 1.03252923, + "epoch": 0.48916278370659855, + "flos": 21473128219680.0, + "grad_norm": 1.7029681295580317, + "language_loss": 0.74360287, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.77031261, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 2.8094475269317627 + }, + { + "auxiliary_loss_clip": 0.01434571, + "auxiliary_loss_mlp": 0.01243852, + "balance_loss_clip": 1.1219238, + "balance_loss_mlp": 1.03366244, + "epoch": 0.4892229069592665, + "flos": 22311955785600.0, + "grad_norm": 1.5986411454616314, + "language_loss": 0.73698884, + "learning_rate": 2.166690739918204e-06, + "loss": 0.76377308, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.785618305206299 + }, + { + "auxiliary_loss_clip": 0.01436285, + "auxiliary_loss_mlp": 0.01232967, + "balance_loss_clip": 1.12188816, + "balance_loss_mlp": 1.02029765, + "epoch": 0.4892830302119345, + "flos": 12788516931840.0, + "grad_norm": 2.7057281730501517, + "language_loss": 0.75077832, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.77747083, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.777723789215088 + }, + { + "auxiliary_loss_clip": 0.01443906, + "auxiliary_loss_mlp": 0.01237806, + "balance_loss_clip": 1.13110662, + "balance_loss_mlp": 1.02704394, + "epoch": 0.48934315346460244, + "flos": 20816016289920.0, + "grad_norm": 1.6406848287019595, + "language_loss": 0.74137878, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76819599, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.8235371112823486 + }, + { + "auxiliary_loss_clip": 0.01437368, + "auxiliary_loss_mlp": 0.01244687, + "balance_loss_clip": 1.12415481, + "balance_loss_mlp": 1.03354383, + "epoch": 0.4894032767172704, + "flos": 19757582491680.0, + "grad_norm": 2.213770451285317, + "language_loss": 0.62076712, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64758766, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 4.276026010513306 + }, + { + "auxiliary_loss_clip": 0.01446489, + "auxiliary_loss_mlp": 0.01257738, + "balance_loss_clip": 1.13286114, + "balance_loss_mlp": 1.04487824, + "epoch": 0.4894633999699384, + "flos": 17820761692320.0, + "grad_norm": 1.8749837550073991, + "language_loss": 0.82433045, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.85137272, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 2.741208076477051 + }, + { + "auxiliary_loss_clip": 0.01452571, + "auxiliary_loss_mlp": 0.01261034, + "balance_loss_clip": 1.14080644, + "balance_loss_mlp": 1.04493105, + "epoch": 0.48952352322260634, + "flos": 25525754627040.0, + "grad_norm": 1.707555781041536, + "language_loss": 0.72512102, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.75225711, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.895463228225708 + }, + { + "auxiliary_loss_clip": 0.01437067, + "auxiliary_loss_mlp": 0.01240146, + "balance_loss_clip": 1.12414682, + "balance_loss_mlp": 1.02995598, + "epoch": 0.4895836464752743, + "flos": 29057849866080.0, + "grad_norm": 1.6284916895014165, + "language_loss": 0.67172986, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69850194, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.9091956615448 + }, + { + "auxiliary_loss_clip": 0.0143801, + "auxiliary_loss_mlp": 0.01233289, + "balance_loss_clip": 1.1262598, + "balance_loss_mlp": 1.02252722, + "epoch": 0.48964376972794227, + "flos": 33549764594400.0, + "grad_norm": 1.675555431574428, + "language_loss": 0.75136358, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77807653, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 2.842548370361328 + }, + { + "auxiliary_loss_clip": 0.01442004, + "auxiliary_loss_mlp": 0.01240735, + "balance_loss_clip": 1.12925506, + "balance_loss_mlp": 1.02882814, + "epoch": 0.48970389298061023, + "flos": 22056203652480.0, + "grad_norm": 1.6239663061689815, + "language_loss": 0.75694841, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.78377581, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 2.7959165573120117 + }, + { + "auxiliary_loss_clip": 0.01442164, + "auxiliary_loss_mlp": 0.01249688, + "balance_loss_clip": 1.12836814, + "balance_loss_mlp": 1.03663719, + "epoch": 0.4897640162332782, + "flos": 20086384989600.0, + "grad_norm": 1.9492099634794033, + "language_loss": 0.79851973, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82543826, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 2.7658286094665527 + }, + { + "auxiliary_loss_clip": 0.01435823, + "auxiliary_loss_mlp": 0.01239645, + "balance_loss_clip": 1.12422347, + "balance_loss_mlp": 1.02773905, + "epoch": 0.48982413948594616, + "flos": 23808540060000.0, + "grad_norm": 1.6021835695344089, + "language_loss": 0.74458337, + "learning_rate": 2.162809359964687e-06, + "loss": 0.77133805, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.819795608520508 + }, + { + "auxiliary_loss_clip": 0.01442329, + "auxiliary_loss_mlp": 0.0124527, + "balance_loss_clip": 1.12933242, + "balance_loss_mlp": 1.03241014, + "epoch": 0.4898842627386142, + "flos": 17641587242880.0, + "grad_norm": 2.7644975446331963, + "language_loss": 0.82841253, + "learning_rate": 2.162421187770864e-06, + "loss": 0.85528851, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 2.776304244995117 + }, + { + "auxiliary_loss_clip": 0.01439602, + "auxiliary_loss_mlp": 0.01237075, + "balance_loss_clip": 1.12640357, + "balance_loss_mlp": 1.02612233, + "epoch": 0.48994438599128215, + "flos": 16619792411520.0, + "grad_norm": 1.7938425961025126, + "language_loss": 0.73829234, + "learning_rate": 2.162033009418015e-06, + "loss": 0.76505911, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.7664713859558105 + }, + { + "auxiliary_loss_clip": 0.01443112, + "auxiliary_loss_mlp": 0.01262396, + "balance_loss_clip": 1.12901962, + "balance_loss_mlp": 1.05010796, + "epoch": 0.4900045092439501, + "flos": 26617262145120.0, + "grad_norm": 1.8169625075519635, + "language_loss": 0.76294708, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.79000217, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.862532377243042 + }, + { + "auxiliary_loss_clip": 0.01447489, + "auxiliary_loss_mlp": 0.01255296, + "balance_loss_clip": 1.13226914, + "balance_loss_mlp": 1.03843081, + "epoch": 0.4900646324966181, + "flos": 19904214215520.0, + "grad_norm": 2.1547989632951006, + "language_loss": 0.72360873, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75063652, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 2.8056745529174805 + }, + { + "auxiliary_loss_clip": 0.01471508, + "auxiliary_loss_mlp": 0.01222732, + "balance_loss_clip": 1.18459356, + "balance_loss_mlp": 1.02436829, + "epoch": 0.49012475574928605, + "flos": 59195525012640.0, + "grad_norm": 0.8295357866879324, + "language_loss": 0.54227114, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56921351, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.2749228477478027 + }, + { + "auxiliary_loss_clip": 0.01450063, + "auxiliary_loss_mlp": 0.01239311, + "balance_loss_clip": 1.13576531, + "balance_loss_mlp": 1.0245434, + "epoch": 0.490184879001954, + "flos": 45263311050240.0, + "grad_norm": 1.916146291426928, + "language_loss": 0.61024386, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.63713759, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 3.0120439529418945 + }, + { + "auxiliary_loss_clip": 0.01444831, + "auxiliary_loss_mlp": 0.01244574, + "balance_loss_clip": 1.13131297, + "balance_loss_mlp": 1.03514731, + "epoch": 0.490245002254622, + "flos": 28004688082080.0, + "grad_norm": 1.9032158592561206, + "language_loss": 0.76775587, + "learning_rate": 2.160092025783549e-06, + "loss": 0.79464996, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 2.860480546951294 + }, + { + "auxiliary_loss_clip": 0.01484135, + "auxiliary_loss_mlp": 0.01209022, + "balance_loss_clip": 1.19729125, + "balance_loss_mlp": 1.01065826, + "epoch": 0.49030512550728994, + "flos": 58958168037120.0, + "grad_norm": 0.9505072040270206, + "language_loss": 0.6699481, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69687963, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.313713550567627 + }, + { + "auxiliary_loss_clip": 0.01458986, + "auxiliary_loss_mlp": 0.01243195, + "balance_loss_clip": 1.14146113, + "balance_loss_mlp": 1.03167045, + "epoch": 0.4903652487599579, + "flos": 19793880105120.0, + "grad_norm": 2.054560774601803, + "language_loss": 0.77058125, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.79760301, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.8388214111328125 + }, + { + "auxiliary_loss_clip": 0.01459166, + "auxiliary_loss_mlp": 0.01250184, + "balance_loss_clip": 1.14218366, + "balance_loss_mlp": 1.03865886, + "epoch": 0.49042537201262587, + "flos": 21764229762240.0, + "grad_norm": 2.445654239711952, + "language_loss": 0.83806461, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.86515808, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 2.7956454753875732 + }, + { + "auxiliary_loss_clip": 0.01447218, + "auxiliary_loss_mlp": 0.01239367, + "balance_loss_clip": 1.13128948, + "balance_loss_mlp": 1.02784228, + "epoch": 0.49048549526529384, + "flos": 18955052539200.0, + "grad_norm": 1.8161109160094333, + "language_loss": 0.79779285, + "learning_rate": 2.158539129514956e-06, + "loss": 0.82465875, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 2.7860450744628906 + }, + { + "auxiliary_loss_clip": 0.0145177, + "auxiliary_loss_mlp": 0.01263016, + "balance_loss_clip": 1.13690162, + "balance_loss_mlp": 1.05091858, + "epoch": 0.4905456185179618, + "flos": 26908818825600.0, + "grad_norm": 2.240815443265037, + "language_loss": 0.6941185, + "learning_rate": 2.158150890381454e-06, + "loss": 0.72126639, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.8103513717651367 + }, + { + "auxiliary_loss_clip": 0.01456214, + "auxiliary_loss_mlp": 0.01247446, + "balance_loss_clip": 1.14242101, + "balance_loss_mlp": 1.03611219, + "epoch": 0.49060574177062977, + "flos": 20414428924320.0, + "grad_norm": 1.895657031897974, + "language_loss": 0.73671204, + "learning_rate": 2.157762645250854e-06, + "loss": 0.76374865, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.6824402809143066 + }, + { + "auxiliary_loss_clip": 0.01447551, + "auxiliary_loss_mlp": 0.01254901, + "balance_loss_clip": 1.13406706, + "balance_loss_mlp": 1.04261267, + "epoch": 0.4906658650232978, + "flos": 17495638225920.0, + "grad_norm": 1.812072157711243, + "language_loss": 0.71924174, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.74626625, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.665800094604492 + }, + { + "auxiliary_loss_clip": 0.01454844, + "auxiliary_loss_mlp": 0.01245737, + "balance_loss_clip": 1.1427114, + "balance_loss_mlp": 1.03478396, + "epoch": 0.49072598827596575, + "flos": 26616844935360.0, + "grad_norm": 1.6908638035801182, + "language_loss": 0.68783724, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.71484303, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.7862393856048584 + }, + { + "auxiliary_loss_clip": 0.01454025, + "auxiliary_loss_mlp": 0.0124504, + "balance_loss_clip": 1.14133775, + "balance_loss_mlp": 1.03218043, + "epoch": 0.4907861115286337, + "flos": 20414694421440.0, + "grad_norm": 2.2881965621756306, + "language_loss": 0.63935691, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.66634756, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 2.7679452896118164 + }, + { + "auxiliary_loss_clip": 0.01456195, + "auxiliary_loss_mlp": 0.01248358, + "balance_loss_clip": 1.14380205, + "balance_loss_mlp": 1.03511667, + "epoch": 0.4908462347813017, + "flos": 14066025968160.0, + "grad_norm": 2.1864253781186935, + "language_loss": 0.76729864, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79434419, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 4.394232988357544 + }, + { + "auxiliary_loss_clip": 0.01453846, + "auxiliary_loss_mlp": 0.0124111, + "balance_loss_clip": 1.14058709, + "balance_loss_mlp": 1.03034782, + "epoch": 0.49090635803396965, + "flos": 18737153074080.0, + "grad_norm": 2.395601194086355, + "language_loss": 0.76809549, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.79504502, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.8412325382232666 + }, + { + "auxiliary_loss_clip": 0.01457347, + "auxiliary_loss_mlp": 0.01247733, + "balance_loss_clip": 1.14535022, + "balance_loss_mlp": 1.03792429, + "epoch": 0.4909664812866376, + "flos": 20560605510240.0, + "grad_norm": 1.7536092432302386, + "language_loss": 0.77008593, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79713672, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.754575729370117 + }, + { + "auxiliary_loss_clip": 0.01495766, + "auxiliary_loss_mlp": 0.01199478, + "balance_loss_clip": 1.21013391, + "balance_loss_mlp": 1.00035095, + "epoch": 0.4910266045393056, + "flos": 54690714345600.0, + "grad_norm": 0.7986263528495358, + "language_loss": 0.54194826, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56890076, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.305689811706543 + }, + { + "auxiliary_loss_clip": 0.01454286, + "auxiliary_loss_mlp": 0.01249361, + "balance_loss_clip": 1.14319611, + "balance_loss_mlp": 1.03974378, + "epoch": 0.49108672779197354, + "flos": 16247941087680.0, + "grad_norm": 2.6781694756912295, + "language_loss": 0.85762346, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.88465995, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 2.778892755508423 + }, + { + "auxiliary_loss_clip": 0.014525, + "auxiliary_loss_mlp": 0.01237182, + "balance_loss_clip": 1.14075184, + "balance_loss_mlp": 1.02756429, + "epoch": 0.4911468510446415, + "flos": 19827295178400.0, + "grad_norm": 1.8100610046285481, + "language_loss": 0.73680174, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.76369858, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 2.8006057739257812 + }, + { + "auxiliary_loss_clip": 0.01446107, + "auxiliary_loss_mlp": 0.01240128, + "balance_loss_clip": 1.1340282, + "balance_loss_mlp": 1.03108287, + "epoch": 0.4912069742973095, + "flos": 21214683187200.0, + "grad_norm": 1.6298564891363638, + "language_loss": 0.77969635, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80655873, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 2.784217119216919 + }, + { + "auxiliary_loss_clip": 0.01456233, + "auxiliary_loss_mlp": 0.01244551, + "balance_loss_clip": 1.14545941, + "balance_loss_mlp": 1.03035545, + "epoch": 0.49126709754997744, + "flos": 19539189960480.0, + "grad_norm": 5.845518737941721, + "language_loss": 0.76259863, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78960645, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 2.824960231781006 + }, + { + "auxiliary_loss_clip": 0.01457567, + "auxiliary_loss_mlp": 0.01249456, + "balance_loss_clip": 1.14484048, + "balance_loss_mlp": 1.03869402, + "epoch": 0.4913272208026454, + "flos": 12241321902720.0, + "grad_norm": 2.480590791418281, + "language_loss": 0.81459558, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.8416658, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 4.241030693054199 + }, + { + "auxiliary_loss_clip": 0.01492609, + "auxiliary_loss_mlp": 0.01217697, + "balance_loss_clip": 1.20751011, + "balance_loss_mlp": 1.01933289, + "epoch": 0.49138734405531337, + "flos": 65472167161440.0, + "grad_norm": 0.6889968074502546, + "language_loss": 0.53212786, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55923092, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 4.8196799755096436 + }, + { + "auxiliary_loss_clip": 0.01457312, + "auxiliary_loss_mlp": 0.01251389, + "balance_loss_clip": 1.14503479, + "balance_loss_mlp": 1.03814697, + "epoch": 0.4914474673079814, + "flos": 18440855373600.0, + "grad_norm": 1.7051293869456943, + "language_loss": 0.62622023, + "learning_rate": 2.152326591972107e-06, + "loss": 0.6533072, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 2.7805933952331543 + }, + { + "auxiliary_loss_clip": 0.01450453, + "auxiliary_loss_mlp": 0.01246167, + "balance_loss_clip": 1.13838983, + "balance_loss_mlp": 1.03654909, + "epoch": 0.49150759056064935, + "flos": 21686969371680.0, + "grad_norm": 2.9047529897880717, + "language_loss": 0.69521844, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.7221846, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.827977180480957 + }, + { + "auxiliary_loss_clip": 0.01455883, + "auxiliary_loss_mlp": 0.01244435, + "balance_loss_clip": 1.14357948, + "balance_loss_mlp": 1.03310096, + "epoch": 0.4915677138133173, + "flos": 22384626868800.0, + "grad_norm": 2.0993785821030437, + "language_loss": 0.74605656, + "learning_rate": 2.151549919570068e-06, + "loss": 0.77305979, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.764237880706787 + }, + { + "auxiliary_loss_clip": 0.01455265, + "auxiliary_loss_mlp": 0.01252694, + "balance_loss_clip": 1.1424824, + "balance_loss_mlp": 1.04193187, + "epoch": 0.4916278370659853, + "flos": 18404709472800.0, + "grad_norm": 1.559082096410341, + "language_loss": 0.7034229, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.73050249, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.764456033706665 + }, + { + "auxiliary_loss_clip": 0.01488521, + "auxiliary_loss_mlp": 0.01206192, + "balance_loss_clip": 1.20348513, + "balance_loss_mlp": 1.00782776, + "epoch": 0.49168796031865325, + "flos": 66616167617280.0, + "grad_norm": 0.6825706644496875, + "language_loss": 0.46180975, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48875684, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 4.737999200820923 + }, + { + "auxiliary_loss_clip": 0.01457232, + "auxiliary_loss_mlp": 0.01252787, + "balance_loss_clip": 1.14587533, + "balance_loss_mlp": 1.04164362, + "epoch": 0.4917480835713212, + "flos": 20961434312640.0, + "grad_norm": 1.8608118490273822, + "language_loss": 0.6577388, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68483901, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.7311644554138184 + }, + { + "auxiliary_loss_clip": 0.01454793, + "auxiliary_loss_mlp": 0.01253846, + "balance_loss_clip": 1.14196634, + "balance_loss_mlp": 1.04232085, + "epoch": 0.4918082068239892, + "flos": 15774403273920.0, + "grad_norm": 2.0703531737905942, + "language_loss": 0.70139027, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72847664, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 2.8840537071228027 + }, + { + "auxiliary_loss_clip": 0.01454298, + "auxiliary_loss_mlp": 0.01244804, + "balance_loss_clip": 1.14234495, + "balance_loss_mlp": 1.03347015, + "epoch": 0.49186833007665715, + "flos": 24606935843040.0, + "grad_norm": 1.8010314202497684, + "language_loss": 0.84366429, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.87065536, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 2.7912821769714355 + }, + { + "auxiliary_loss_clip": 0.01451866, + "auxiliary_loss_mlp": 0.01245929, + "balance_loss_clip": 1.13965762, + "balance_loss_mlp": 1.03650212, + "epoch": 0.4919284533293251, + "flos": 22092728834880.0, + "grad_norm": 2.0770236487796443, + "language_loss": 0.72877538, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.75575328, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.808638572692871 + }, + { + "auxiliary_loss_clip": 0.01451945, + "auxiliary_loss_mlp": 0.0125057, + "balance_loss_clip": 1.13909364, + "balance_loss_mlp": 1.04095221, + "epoch": 0.4919885765819931, + "flos": 23370313727520.0, + "grad_norm": 2.009077275503603, + "language_loss": 0.72339141, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.75041664, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 2.7985785007476807 + }, + { + "auxiliary_loss_clip": 0.01460751, + "auxiliary_loss_mlp": 0.0124421, + "balance_loss_clip": 1.1489073, + "balance_loss_mlp": 1.03020549, + "epoch": 0.49204869983466104, + "flos": 21362604468480.0, + "grad_norm": 4.868580148589285, + "language_loss": 0.77194655, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79899609, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 2.770582914352417 + }, + { + "auxiliary_loss_clip": 0.0145309, + "auxiliary_loss_mlp": 0.01243365, + "balance_loss_clip": 1.14107752, + "balance_loss_mlp": 1.0333662, + "epoch": 0.492108823087329, + "flos": 21144856716000.0, + "grad_norm": 1.615317778617372, + "language_loss": 0.70679116, + "learning_rate": 2.148054610995789e-06, + "loss": 0.73375571, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 2.842646598815918 + }, + { + "auxiliary_loss_clip": 0.01452916, + "auxiliary_loss_mlp": 0.01245074, + "balance_loss_clip": 1.14006352, + "balance_loss_mlp": 1.03335881, + "epoch": 0.49216894633999697, + "flos": 25118819390880.0, + "grad_norm": 3.970031355353206, + "language_loss": 0.75164795, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77862787, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.809227705001831 + }, + { + "auxiliary_loss_clip": 0.01458033, + "auxiliary_loss_mlp": 0.01244508, + "balance_loss_clip": 1.14503992, + "balance_loss_mlp": 1.03107572, + "epoch": 0.49222906959266494, + "flos": 22640151432960.0, + "grad_norm": 2.1740620962531834, + "language_loss": 0.67803377, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.70505917, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 2.8425567150115967 + }, + { + "auxiliary_loss_clip": 0.01456649, + "auxiliary_loss_mlp": 0.01244323, + "balance_loss_clip": 1.14450204, + "balance_loss_mlp": 1.03279757, + "epoch": 0.49228919284533296, + "flos": 20412532516320.0, + "grad_norm": 1.6100516934486269, + "language_loss": 0.66718417, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69419384, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.784874677658081 + }, + { + "auxiliary_loss_clip": 0.01455787, + "auxiliary_loss_mlp": 0.01235025, + "balance_loss_clip": 1.14311755, + "balance_loss_mlp": 1.02483559, + "epoch": 0.4923493160980009, + "flos": 27124632241920.0, + "grad_norm": 1.754223454822692, + "language_loss": 0.74698377, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.77389187, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 2.8412482738494873 + }, + { + "auxiliary_loss_clip": 0.01462108, + "auxiliary_loss_mlp": 0.01248396, + "balance_loss_clip": 1.14934087, + "balance_loss_mlp": 1.03610826, + "epoch": 0.4924094393506689, + "flos": 35739644627520.0, + "grad_norm": 1.616210126290573, + "language_loss": 0.64559853, + "learning_rate": 2.146112575713104e-06, + "loss": 0.67270356, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.921051263809204 + }, + { + "auxiliary_loss_clip": 0.01458433, + "auxiliary_loss_mlp": 0.01242472, + "balance_loss_clip": 1.14662337, + "balance_loss_mlp": 1.03037453, + "epoch": 0.49246956260333685, + "flos": 20414580636960.0, + "grad_norm": 1.869795875376121, + "language_loss": 0.71672809, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.7437371, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.8336095809936523 + }, + { + "auxiliary_loss_clip": 0.01461165, + "auxiliary_loss_mlp": 0.01238983, + "balance_loss_clip": 1.14952445, + "balance_loss_mlp": 1.02955675, + "epoch": 0.4925296858560048, + "flos": 38979690120000.0, + "grad_norm": 1.7776223949229477, + "language_loss": 0.71710235, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.74410385, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 2.9402549266815186 + }, + { + "auxiliary_loss_clip": 0.01520889, + "auxiliary_loss_mlp": 0.0118911, + "balance_loss_clip": 1.23837948, + "balance_loss_mlp": 0.9899826, + "epoch": 0.4925898091086728, + "flos": 64286103647520.0, + "grad_norm": 0.7181355265061635, + "language_loss": 0.52067822, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54777819, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.3714969158172607 + }, + { + "auxiliary_loss_clip": 0.01461582, + "auxiliary_loss_mlp": 0.0123764, + "balance_loss_clip": 1.14858127, + "balance_loss_mlp": 1.02706873, + "epoch": 0.49264993236134075, + "flos": 23038439048640.0, + "grad_norm": 1.5150215351789087, + "language_loss": 0.77231658, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79930878, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 2.8719136714935303 + }, + { + "auxiliary_loss_clip": 0.01455645, + "auxiliary_loss_mlp": 0.01240972, + "balance_loss_clip": 1.14318824, + "balance_loss_mlp": 1.03059125, + "epoch": 0.4927100556140087, + "flos": 24720569703360.0, + "grad_norm": 1.9933337770814035, + "language_loss": 0.70085251, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72781873, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.7911360263824463 + }, + { + "auxiliary_loss_clip": 0.01466027, + "auxiliary_loss_mlp": 0.0124972, + "balance_loss_clip": 1.15407252, + "balance_loss_mlp": 1.03819466, + "epoch": 0.4927701788666767, + "flos": 23507349626880.0, + "grad_norm": 2.193378111686875, + "language_loss": 0.80734813, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83450556, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.8048365116119385 + }, + { + "auxiliary_loss_clip": 0.0145676, + "auxiliary_loss_mlp": 0.01238372, + "balance_loss_clip": 1.14572716, + "balance_loss_mlp": 1.02493954, + "epoch": 0.49283030211934464, + "flos": 22931063334720.0, + "grad_norm": 2.548127211047754, + "language_loss": 0.7059775, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.73292887, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 2.7912909984588623 + }, + { + "auxiliary_loss_clip": 0.01456575, + "auxiliary_loss_mlp": 0.01237964, + "balance_loss_clip": 1.14698601, + "balance_loss_mlp": 1.02548575, + "epoch": 0.4928904253720126, + "flos": 16874899765920.0, + "grad_norm": 1.9228495166590014, + "language_loss": 0.84220123, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86914659, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 2.7491633892059326 + }, + { + "auxiliary_loss_clip": 0.01462125, + "auxiliary_loss_mlp": 0.01237535, + "balance_loss_clip": 1.15112972, + "balance_loss_mlp": 1.02276707, + "epoch": 0.4929505486246806, + "flos": 14868328351680.0, + "grad_norm": 2.142488735919082, + "language_loss": 0.76077306, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78776968, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.7878243923187256 + }, + { + "auxiliary_loss_clip": 0.01463608, + "auxiliary_loss_mlp": 0.01236779, + "balance_loss_clip": 1.15433741, + "balance_loss_mlp": 1.02143979, + "epoch": 0.49301067187734854, + "flos": 23844648032640.0, + "grad_norm": 2.4053674225638377, + "language_loss": 0.59943092, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.6264348, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.7803542613983154 + }, + { + "auxiliary_loss_clip": 0.01459142, + "auxiliary_loss_mlp": 0.01235932, + "balance_loss_clip": 1.1500802, + "balance_loss_mlp": 1.02536082, + "epoch": 0.49307079513001656, + "flos": 22493443852800.0, + "grad_norm": 1.5065681367813684, + "language_loss": 0.79192114, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81887186, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 2.814922332763672 + }, + { + "auxiliary_loss_clip": 0.01462075, + "auxiliary_loss_mlp": 0.01241687, + "balance_loss_clip": 1.15168118, + "balance_loss_mlp": 1.0276829, + "epoch": 0.4931309183826845, + "flos": 15926420796480.0, + "grad_norm": 2.301928669161659, + "language_loss": 0.68154228, + "learning_rate": 2.141451129398785e-06, + "loss": 0.70857996, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 4.2677161693573 + }, + { + "auxiliary_loss_clip": 0.01461006, + "auxiliary_loss_mlp": 0.01226188, + "balance_loss_clip": 1.15163827, + "balance_loss_mlp": 1.01428139, + "epoch": 0.4931910416353525, + "flos": 27311581964160.0, + "grad_norm": 2.38949686960702, + "language_loss": 0.74968147, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77655339, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.768420457839966 + }, + { + "auxiliary_loss_clip": 0.01464809, + "auxiliary_loss_mlp": 0.01239707, + "balance_loss_clip": 1.15530777, + "balance_loss_mlp": 1.02741885, + "epoch": 0.49325116488802045, + "flos": 20807823807360.0, + "grad_norm": 2.0679515933505597, + "language_loss": 0.79826605, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82531124, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 2.7960140705108643 + }, + { + "auxiliary_loss_clip": 0.01460262, + "auxiliary_loss_mlp": 0.0123987, + "balance_loss_clip": 1.15176225, + "balance_loss_mlp": 1.02891779, + "epoch": 0.4933112881406884, + "flos": 19868068314720.0, + "grad_norm": 2.503855468756379, + "language_loss": 0.66211945, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68912077, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 2.7530791759490967 + }, + { + "auxiliary_loss_clip": 0.01464804, + "auxiliary_loss_mlp": 0.01240165, + "balance_loss_clip": 1.15562761, + "balance_loss_mlp": 1.02558827, + "epoch": 0.4933714113933564, + "flos": 21829732423200.0, + "grad_norm": 1.971209267677435, + "language_loss": 0.66205466, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68910432, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 2.7391817569732666 + }, + { + "auxiliary_loss_clip": 0.01460268, + "auxiliary_loss_mlp": 0.01234457, + "balance_loss_clip": 1.15051126, + "balance_loss_mlp": 1.02293241, + "epoch": 0.49343153464602435, + "flos": 27892533420000.0, + "grad_norm": 2.3635736134962975, + "language_loss": 0.76565671, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.79260385, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 2.799133062362671 + }, + { + "auxiliary_loss_clip": 0.01467611, + "auxiliary_loss_mlp": 0.01241824, + "balance_loss_clip": 1.15871072, + "balance_loss_mlp": 1.0283916, + "epoch": 0.4934916578986923, + "flos": 24683399742240.0, + "grad_norm": 2.1423901423748646, + "language_loss": 0.59925061, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62634498, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 2.745659112930298 + }, + { + "auxiliary_loss_clip": 0.01462283, + "auxiliary_loss_mlp": 0.01236135, + "balance_loss_clip": 1.15281296, + "balance_loss_mlp": 1.02422833, + "epoch": 0.4935517811513603, + "flos": 23407445760480.0, + "grad_norm": 3.4429305377394193, + "language_loss": 0.78760946, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.81459361, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 2.771686553955078 + }, + { + "auxiliary_loss_clip": 0.0147156, + "auxiliary_loss_mlp": 0.01229764, + "balance_loss_clip": 1.1619854, + "balance_loss_mlp": 1.01862037, + "epoch": 0.49361190440402825, + "flos": 21946666033440.0, + "grad_norm": 1.9436544716426094, + "language_loss": 0.78710121, + "learning_rate": 2.138343067844089e-06, + "loss": 0.81411445, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 4.0880820751190186 + }, + { + "auxiliary_loss_clip": 0.01468177, + "auxiliary_loss_mlp": 0.0124953, + "balance_loss_clip": 1.15856338, + "balance_loss_mlp": 1.03304565, + "epoch": 0.4936720276566962, + "flos": 25117833258720.0, + "grad_norm": 1.8740806017017482, + "language_loss": 0.81431699, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.84149408, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 4.256127595901489 + }, + { + "auxiliary_loss_clip": 0.01467375, + "auxiliary_loss_mlp": 0.01244996, + "balance_loss_clip": 1.15776598, + "balance_loss_mlp": 1.03404307, + "epoch": 0.4937321509093642, + "flos": 26361320371200.0, + "grad_norm": 2.4818634211607535, + "language_loss": 0.91161323, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93873698, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 2.7632861137390137 + }, + { + "auxiliary_loss_clip": 0.01470928, + "auxiliary_loss_mlp": 0.01241079, + "balance_loss_clip": 1.16119015, + "balance_loss_mlp": 1.03050792, + "epoch": 0.49379227416203214, + "flos": 22963340563200.0, + "grad_norm": 2.282419796420648, + "language_loss": 0.64685196, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.67397201, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.743192672729492 + }, + { + "auxiliary_loss_clip": 0.01475307, + "auxiliary_loss_mlp": 0.01246894, + "balance_loss_clip": 1.1659286, + "balance_loss_mlp": 1.037467, + "epoch": 0.49385239741470016, + "flos": 32491861790400.0, + "grad_norm": 1.8415070299075857, + "language_loss": 0.75417936, + "learning_rate": 2.136788910691711e-06, + "loss": 0.7814014, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.8956308364868164 + }, + { + "auxiliary_loss_clip": 0.01476463, + "auxiliary_loss_mlp": 0.0125232, + "balance_loss_clip": 1.1689229, + "balance_loss_mlp": 1.04155815, + "epoch": 0.4939125206673681, + "flos": 22495112691840.0, + "grad_norm": 1.944443312280644, + "language_loss": 0.84570014, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.87298799, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.7808635234832764 + }, + { + "auxiliary_loss_clip": 0.01474091, + "auxiliary_loss_mlp": 0.01233046, + "balance_loss_clip": 1.16648805, + "balance_loss_mlp": 1.02266502, + "epoch": 0.4939726439200361, + "flos": 31179079200960.0, + "grad_norm": 1.9357309620105017, + "language_loss": 0.84085077, + "learning_rate": 2.136011800934292e-06, + "loss": 0.86792213, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 4.351099252700806 + }, + { + "auxiliary_loss_clip": 0.01472492, + "auxiliary_loss_mlp": 0.01234129, + "balance_loss_clip": 1.16293621, + "balance_loss_mlp": 1.02432024, + "epoch": 0.49403276717270406, + "flos": 22676411118240.0, + "grad_norm": 1.695870664282719, + "language_loss": 0.74863368, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.77569985, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 2.7916483879089355 + }, + { + "auxiliary_loss_clip": 0.01478209, + "auxiliary_loss_mlp": 0.0123183, + "balance_loss_clip": 1.1699754, + "balance_loss_mlp": 1.02087736, + "epoch": 0.494092890425372, + "flos": 20743193494080.0, + "grad_norm": 1.8256169750882802, + "language_loss": 0.78899729, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.81609762, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.9233956336975098 + }, + { + "auxiliary_loss_clip": 0.01476771, + "auxiliary_loss_mlp": 0.01239685, + "balance_loss_clip": 1.16861725, + "balance_loss_mlp": 1.03102112, + "epoch": 0.49415301367804, + "flos": 18370990974240.0, + "grad_norm": 3.031828094896445, + "language_loss": 0.77139926, + "learning_rate": 2.134846097653142e-06, + "loss": 0.79856384, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.7752904891967773 + }, + { + "auxiliary_loss_clip": 0.01475506, + "auxiliary_loss_mlp": 0.01243235, + "balance_loss_clip": 1.16687584, + "balance_loss_mlp": 1.03457069, + "epoch": 0.49421313693070795, + "flos": 17532580618080.0, + "grad_norm": 2.0333758264344954, + "language_loss": 0.62014067, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64732814, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 2.779113531112671 + }, + { + "auxiliary_loss_clip": 0.01473795, + "auxiliary_loss_mlp": 0.01242219, + "balance_loss_clip": 1.16600108, + "balance_loss_mlp": 1.0324111, + "epoch": 0.4942732601833759, + "flos": 20814195738240.0, + "grad_norm": 1.9904697278352212, + "language_loss": 0.72493041, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.75209057, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.767402172088623 + }, + { + "auxiliary_loss_clip": 0.01469569, + "auxiliary_loss_mlp": 0.01234515, + "balance_loss_clip": 1.16178191, + "balance_loss_mlp": 1.02718592, + "epoch": 0.4943333834360439, + "flos": 15050726694720.0, + "grad_norm": 1.8448725482362298, + "language_loss": 0.79133987, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81838059, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.8131673336029053 + }, + { + "auxiliary_loss_clip": 0.01487235, + "auxiliary_loss_mlp": 0.0124049, + "balance_loss_clip": 1.18002272, + "balance_loss_mlp": 1.02858388, + "epoch": 0.49439350668871185, + "flos": 16072256028960.0, + "grad_norm": 2.6310440365097887, + "language_loss": 0.71745288, + "learning_rate": 2.133291755093088e-06, + "loss": 0.74473017, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 2.7353014945983887 + }, + { + "auxiliary_loss_clip": 0.01479537, + "auxiliary_loss_mlp": 0.01249236, + "balance_loss_clip": 1.17091227, + "balance_loss_mlp": 1.03752077, + "epoch": 0.4944536299413798, + "flos": 20881822376160.0, + "grad_norm": 2.810235624698318, + "language_loss": 0.75319713, + "learning_rate": 2.132903156780144e-06, + "loss": 0.78048491, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 2.7343835830688477 + }, + { + "auxiliary_loss_clip": 0.01482898, + "auxiliary_loss_mlp": 0.01245283, + "balance_loss_clip": 1.17477965, + "balance_loss_mlp": 1.03204107, + "epoch": 0.4945137531940478, + "flos": 26611003998720.0, + "grad_norm": 2.1324188622855114, + "language_loss": 0.63468134, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66196311, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 2.7753334045410156 + }, + { + "auxiliary_loss_clip": 0.01471838, + "auxiliary_loss_mlp": 0.01249826, + "balance_loss_clip": 1.16275191, + "balance_loss_mlp": 1.04058993, + "epoch": 0.49457387644671574, + "flos": 23990445336960.0, + "grad_norm": 2.095652071016562, + "language_loss": 0.76573402, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.79295063, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.859924793243408 + }, + { + "auxiliary_loss_clip": 0.01471793, + "auxiliary_loss_mlp": 0.01238609, + "balance_loss_clip": 1.16345489, + "balance_loss_mlp": 1.02803731, + "epoch": 0.49463399969938376, + "flos": 26978872865760.0, + "grad_norm": 1.81832495465654, + "language_loss": 0.70833421, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73543817, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.7556469440460205 + }, + { + "auxiliary_loss_clip": 0.01476974, + "auxiliary_loss_mlp": 0.01246165, + "balance_loss_clip": 1.16859341, + "balance_loss_mlp": 1.03254187, + "epoch": 0.49469412295205173, + "flos": 29684239621920.0, + "grad_norm": 1.8974452570414648, + "language_loss": 0.71540022, + "learning_rate": 2.131348713278718e-06, + "loss": 0.74263161, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 2.8157272338867188 + }, + { + "auxiliary_loss_clip": 0.01473804, + "auxiliary_loss_mlp": 0.01236335, + "balance_loss_clip": 1.16503668, + "balance_loss_mlp": 1.02824354, + "epoch": 0.4947542462047197, + "flos": 24133966951680.0, + "grad_norm": 1.943478083456998, + "language_loss": 0.84118533, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.86828667, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.813723087310791 + }, + { + "auxiliary_loss_clip": 0.01472529, + "auxiliary_loss_mlp": 0.01240794, + "balance_loss_clip": 1.16511178, + "balance_loss_mlp": 1.03022313, + "epoch": 0.49481436945738766, + "flos": 20046977267040.0, + "grad_norm": 2.5155504081643665, + "language_loss": 0.74760258, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.77473581, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.729515790939331 + }, + { + "auxiliary_loss_clip": 0.01468599, + "auxiliary_loss_mlp": 0.01245986, + "balance_loss_clip": 1.15998459, + "balance_loss_mlp": 1.03369856, + "epoch": 0.4948744927100556, + "flos": 15671389298400.0, + "grad_norm": 2.2751187008305513, + "language_loss": 0.7955237, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.82266951, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 2.7332468032836914 + }, + { + "auxiliary_loss_clip": 0.01658102, + "auxiliary_loss_mlp": 0.01202156, + "balance_loss_clip": 1.37890494, + "balance_loss_mlp": 1.00455475, + "epoch": 0.4949346159627236, + "flos": 68879287656000.0, + "grad_norm": 0.9218509706500893, + "language_loss": 0.60099256, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62959504, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.469805955886841 + }, + { + "auxiliary_loss_clip": 0.01472336, + "auxiliary_loss_mlp": 0.0124273, + "balance_loss_clip": 1.1626792, + "balance_loss_mlp": 1.02853441, + "epoch": 0.49499473921539155, + "flos": 24792596007840.0, + "grad_norm": 1.7703568740970979, + "language_loss": 0.68929231, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.716443, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.788675308227539 + }, + { + "auxiliary_loss_clip": 0.01475687, + "auxiliary_loss_mlp": 0.01239916, + "balance_loss_clip": 1.16604221, + "balance_loss_mlp": 1.02762842, + "epoch": 0.4950548624680595, + "flos": 32710671531360.0, + "grad_norm": 2.8554197533528494, + "language_loss": 0.66780722, + "learning_rate": 2.129016898898633e-06, + "loss": 0.69496328, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 2.8844048976898193 + }, + { + "auxiliary_loss_clip": 0.01648269, + "auxiliary_loss_mlp": 0.01207733, + "balance_loss_clip": 1.36683977, + "balance_loss_mlp": 1.01089478, + "epoch": 0.4951149857207275, + "flos": 50088351722400.0, + "grad_norm": 0.793865090596792, + "language_loss": 0.58010995, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60867, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 3.180217981338501 + }, + { + "auxiliary_loss_clip": 0.0147444, + "auxiliary_loss_mlp": 0.01235734, + "balance_loss_clip": 1.16498649, + "balance_loss_mlp": 1.02306521, + "epoch": 0.49517510897339545, + "flos": 22238981277120.0, + "grad_norm": 1.7142116265916323, + "language_loss": 0.77224988, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.79935157, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.8007426261901855 + }, + { + "auxiliary_loss_clip": 0.0147299, + "auxiliary_loss_mlp": 0.01230012, + "balance_loss_clip": 1.16600502, + "balance_loss_mlp": 1.01886868, + "epoch": 0.4952352322260634, + "flos": 25376467932000.0, + "grad_norm": 1.7546271645950642, + "language_loss": 0.73042035, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.75745034, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 2.773793935775757 + }, + { + "auxiliary_loss_clip": 0.01464355, + "auxiliary_loss_mlp": 0.01244964, + "balance_loss_clip": 1.15638304, + "balance_loss_mlp": 1.03210378, + "epoch": 0.4952953554787314, + "flos": 24611070012480.0, + "grad_norm": 2.3321216284054347, + "language_loss": 0.75913435, + "learning_rate": 2.127462257935406e-06, + "loss": 0.78622752, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 2.8770785331726074 + }, + { + "auxiliary_loss_clip": 0.0146444, + "auxiliary_loss_mlp": 0.01238432, + "balance_loss_clip": 1.15498078, + "balance_loss_mlp": 1.02576303, + "epoch": 0.49535547873139935, + "flos": 17313201954720.0, + "grad_norm": 2.456684640868065, + "language_loss": 0.7396602, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.76668894, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 2.7826247215270996 + }, + { + "auxiliary_loss_clip": 0.01464073, + "auxiliary_loss_mlp": 0.01245092, + "balance_loss_clip": 1.15492582, + "balance_loss_mlp": 1.03146899, + "epoch": 0.4954156019840673, + "flos": 20742397002720.0, + "grad_norm": 5.74994366699081, + "language_loss": 0.7833643, + "learning_rate": 2.126684908394552e-06, + "loss": 0.81045592, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 4.268972158432007 + }, + { + "auxiliary_loss_clip": 0.01462522, + "auxiliary_loss_mlp": 0.01230752, + "balance_loss_clip": 1.15389633, + "balance_loss_mlp": 1.02094388, + "epoch": 0.49547572523673533, + "flos": 12822197502240.0, + "grad_norm": 2.1146100522974485, + "language_loss": 0.85405421, + "learning_rate": 2.126296226410898e-06, + "loss": 0.88098699, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.7998523712158203 + }, + { + "auxiliary_loss_clip": 0.01458593, + "auxiliary_loss_mlp": 0.01230838, + "balance_loss_clip": 1.14965749, + "balance_loss_mlp": 1.02102971, + "epoch": 0.4955358484894033, + "flos": 15598907856000.0, + "grad_norm": 1.8800623030083785, + "language_loss": 0.77157414, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79846847, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.767364025115967 + }, + { + "auxiliary_loss_clip": 0.01467185, + "auxiliary_loss_mlp": 0.01233482, + "balance_loss_clip": 1.1580137, + "balance_loss_mlp": 1.01966834, + "epoch": 0.49559597174207126, + "flos": 26466534180000.0, + "grad_norm": 3.0935717378486722, + "language_loss": 0.66765052, + "learning_rate": 2.125518848090833e-06, + "loss": 0.69465721, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.8017942905426025 + }, + { + "auxiliary_loss_clip": 0.01462467, + "auxiliary_loss_mlp": 0.01241657, + "balance_loss_clip": 1.15467215, + "balance_loss_mlp": 1.02898812, + "epoch": 0.4956560949947392, + "flos": 23150441998080.0, + "grad_norm": 2.0516923294526763, + "language_loss": 0.68489444, + "learning_rate": 2.125130151783901e-06, + "loss": 0.71193564, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 2.752117395401001 + }, + { + "auxiliary_loss_clip": 0.01463044, + "auxiliary_loss_mlp": 0.01234808, + "balance_loss_clip": 1.15495884, + "balance_loss_mlp": 1.02080393, + "epoch": 0.4957162182474072, + "flos": 20775356938080.0, + "grad_norm": 2.080638853151235, + "language_loss": 0.748945, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77592349, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 2.8032987117767334 + }, + { + "auxiliary_loss_clip": 0.01466329, + "auxiliary_loss_mlp": 0.01243589, + "balance_loss_clip": 1.15936327, + "balance_loss_mlp": 1.03263664, + "epoch": 0.49577634150007516, + "flos": 18736204870080.0, + "grad_norm": 1.90351833856465, + "language_loss": 0.81769085, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.84479004, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 2.7874598503112793 + }, + { + "auxiliary_loss_clip": 0.01463031, + "auxiliary_loss_mlp": 0.01238139, + "balance_loss_clip": 1.15376604, + "balance_loss_mlp": 1.02451634, + "epoch": 0.4958364647527431, + "flos": 25556400944640.0, + "grad_norm": 7.706543266493909, + "language_loss": 0.83673143, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.86374307, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 2.858565330505371 + }, + { + "auxiliary_loss_clip": 0.01468226, + "auxiliary_loss_mlp": 0.01249118, + "balance_loss_clip": 1.16032481, + "balance_loss_mlp": 1.03740263, + "epoch": 0.4958965880054111, + "flos": 24427609680960.0, + "grad_norm": 3.5293422349838313, + "language_loss": 0.83428192, + "learning_rate": 2.123575319254087e-06, + "loss": 0.86145532, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 4.301536560058594 + }, + { + "auxiliary_loss_clip": 0.01469515, + "auxiliary_loss_mlp": 0.01248577, + "balance_loss_clip": 1.16039884, + "balance_loss_mlp": 1.03590727, + "epoch": 0.49595671125807905, + "flos": 25085859455520.0, + "grad_norm": 2.114704528833066, + "language_loss": 0.73276722, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75994813, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 4.297773599624634 + }, + { + "auxiliary_loss_clip": 0.01465581, + "auxiliary_loss_mlp": 0.01241811, + "balance_loss_clip": 1.15849543, + "balance_loss_mlp": 1.03028607, + "epoch": 0.496016834510747, + "flos": 16437773350080.0, + "grad_norm": 1.910710726170665, + "language_loss": 0.76496673, + "learning_rate": 2.122797874814289e-06, + "loss": 0.79204059, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 2.74985671043396 + }, + { + "auxiliary_loss_clip": 0.01463483, + "auxiliary_loss_mlp": 0.0124249, + "balance_loss_clip": 1.15626609, + "balance_loss_mlp": 1.03153765, + "epoch": 0.496076957763415, + "flos": 23440026414240.0, + "grad_norm": 2.2124912389774063, + "language_loss": 0.7008661, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.7279259, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 2.8748135566711426 + }, + { + "auxiliary_loss_clip": 0.01472028, + "auxiliary_loss_mlp": 0.01241516, + "balance_loss_clip": 1.16423488, + "balance_loss_mlp": 1.02808404, + "epoch": 0.49613708101608295, + "flos": 16911387020160.0, + "grad_norm": 1.8711996843571066, + "language_loss": 0.80196786, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82910329, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 2.780604839324951 + }, + { + "auxiliary_loss_clip": 0.01481392, + "auxiliary_loss_mlp": 0.01248084, + "balance_loss_clip": 1.17387509, + "balance_loss_mlp": 1.03503346, + "epoch": 0.4961972042687509, + "flos": 16619982052320.0, + "grad_norm": 2.0922116368712196, + "language_loss": 0.81127727, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.83857203, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.740840196609497 + }, + { + "auxiliary_loss_clip": 0.01471205, + "auxiliary_loss_mlp": 0.01234242, + "balance_loss_clip": 1.16364348, + "balance_loss_mlp": 1.02309847, + "epoch": 0.49625732752141893, + "flos": 28959690695040.0, + "grad_norm": 1.6495669388288452, + "language_loss": 0.67490584, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.70196027, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 4.318636417388916 + }, + { + "auxiliary_loss_clip": 0.01468696, + "auxiliary_loss_mlp": 0.01248231, + "balance_loss_clip": 1.16189098, + "balance_loss_mlp": 1.0355618, + "epoch": 0.4963174507740869, + "flos": 23114599522560.0, + "grad_norm": 2.547967913807665, + "language_loss": 0.73752075, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76469004, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.813572406768799 + }, + { + "auxiliary_loss_clip": 0.01467373, + "auxiliary_loss_mlp": 0.01241585, + "balance_loss_clip": 1.15911245, + "balance_loss_mlp": 1.02929735, + "epoch": 0.49637757402675486, + "flos": 13919583885120.0, + "grad_norm": 2.221657015753265, + "language_loss": 0.81601816, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.84310776, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.7330143451690674 + }, + { + "auxiliary_loss_clip": 0.01469241, + "auxiliary_loss_mlp": 0.01250551, + "balance_loss_clip": 1.1631248, + "balance_loss_mlp": 1.04074216, + "epoch": 0.49643769727942283, + "flos": 22311159294240.0, + "grad_norm": 1.9402259008640101, + "language_loss": 0.80751485, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83471274, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.821786403656006 + }, + { + "auxiliary_loss_clip": 0.01470212, + "auxiliary_loss_mlp": 0.0123734, + "balance_loss_clip": 1.16324186, + "balance_loss_mlp": 1.02619624, + "epoch": 0.4964978205320908, + "flos": 19502285496480.0, + "grad_norm": 2.9196903816952795, + "language_loss": 0.66145426, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68852973, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.7739408016204834 + }, + { + "auxiliary_loss_clip": 0.01472062, + "auxiliary_loss_mlp": 0.01238674, + "balance_loss_clip": 1.16467047, + "balance_loss_mlp": 1.02924728, + "epoch": 0.49655794378475876, + "flos": 23438547216000.0, + "grad_norm": 1.5493200899947528, + "language_loss": 0.77519619, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.80230355, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.8097970485687256 + }, + { + "auxiliary_loss_clip": 0.01468178, + "auxiliary_loss_mlp": 0.0124151, + "balance_loss_clip": 1.16039336, + "balance_loss_mlp": 1.03036618, + "epoch": 0.4966180670374267, + "flos": 26833416914880.0, + "grad_norm": 1.675216339689665, + "language_loss": 0.78530914, + "learning_rate": 2.1189103755834e-06, + "loss": 0.81240594, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 2.8151662349700928 + }, + { + "auxiliary_loss_clip": 0.01463303, + "auxiliary_loss_mlp": 0.01246264, + "balance_loss_clip": 1.15575588, + "balance_loss_mlp": 1.03531122, + "epoch": 0.4966781902900947, + "flos": 22011030849600.0, + "grad_norm": 3.5613493570465056, + "language_loss": 0.75940973, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.7865054, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.7746212482452393 + }, + { + "auxiliary_loss_clip": 0.01465478, + "auxiliary_loss_mlp": 0.01243824, + "balance_loss_clip": 1.15763903, + "balance_loss_mlp": 1.03496909, + "epoch": 0.49673831354276266, + "flos": 26215750635840.0, + "grad_norm": 2.343779809174366, + "language_loss": 0.89292884, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.92002189, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 2.8162097930908203 + }, + { + "auxiliary_loss_clip": 0.01470316, + "auxiliary_loss_mlp": 0.01233859, + "balance_loss_clip": 1.16450989, + "balance_loss_mlp": 1.02462268, + "epoch": 0.4967984367954306, + "flos": 23184350137440.0, + "grad_norm": 1.8526788697205367, + "language_loss": 0.74162567, + "learning_rate": 2.11774403721606e-06, + "loss": 0.7686674, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 2.7893998622894287 + }, + { + "auxiliary_loss_clip": 0.01472264, + "auxiliary_loss_mlp": 0.01239226, + "balance_loss_clip": 1.16553867, + "balance_loss_mlp": 1.02560258, + "epoch": 0.4968585600480986, + "flos": 19283475755520.0, + "grad_norm": 2.136238701481305, + "language_loss": 0.69693428, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.72404921, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 2.786794900894165 + }, + { + "auxiliary_loss_clip": 0.01457413, + "auxiliary_loss_mlp": 0.01249138, + "balance_loss_clip": 1.14931226, + "balance_loss_mlp": 1.03952026, + "epoch": 0.49691868330076655, + "flos": 22530917239200.0, + "grad_norm": 1.7142674346317308, + "language_loss": 0.6467641, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67382967, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.780211925506592 + }, + { + "auxiliary_loss_clip": 0.01619937, + "auxiliary_loss_mlp": 0.0121283, + "balance_loss_clip": 1.33841991, + "balance_loss_mlp": 1.01446533, + "epoch": 0.4969788065534345, + "flos": 66584421383040.0, + "grad_norm": 0.9899437998974442, + "language_loss": 0.53426087, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.56258857, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.3686532974243164 + }, + { + "auxiliary_loss_clip": 0.01468318, + "auxiliary_loss_mlp": 0.01246865, + "balance_loss_clip": 1.16193497, + "balance_loss_mlp": 1.03610277, + "epoch": 0.49703892980610254, + "flos": 24061713078240.0, + "grad_norm": 1.892123243600738, + "language_loss": 0.79430485, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.82145667, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 2.79377818107605 + }, + { + "auxiliary_loss_clip": 0.01463312, + "auxiliary_loss_mlp": 0.01250924, + "balance_loss_clip": 1.15564513, + "balance_loss_mlp": 1.03978014, + "epoch": 0.4970990530587705, + "flos": 29128396972320.0, + "grad_norm": 2.844802134176391, + "language_loss": 0.74285442, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76999676, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.890089750289917 + }, + { + "auxiliary_loss_clip": 0.01456433, + "auxiliary_loss_mlp": 0.0124147, + "balance_loss_clip": 1.14760518, + "balance_loss_mlp": 1.02899134, + "epoch": 0.49715917631143847, + "flos": 46029733030080.0, + "grad_norm": 2.3545457293378016, + "language_loss": 0.67938185, + "learning_rate": 2.115411240328073e-06, + "loss": 0.70636082, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 3.0279006958007812 + }, + { + "auxiliary_loss_clip": 0.01461248, + "auxiliary_loss_mlp": 0.01241525, + "balance_loss_clip": 1.15337825, + "balance_loss_mlp": 1.03228915, + "epoch": 0.49721929956410643, + "flos": 20193229709280.0, + "grad_norm": 1.6494388798974111, + "language_loss": 0.85607594, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.88310367, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.765350580215454 + }, + { + "auxiliary_loss_clip": 0.0145821, + "auxiliary_loss_mlp": 0.01250385, + "balance_loss_clip": 1.15024543, + "balance_loss_mlp": 1.04133987, + "epoch": 0.4972794228167744, + "flos": 21655754131680.0, + "grad_norm": 1.8424651094596074, + "language_loss": 0.7057175, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73280346, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.8482232093811035 + }, + { + "auxiliary_loss_clip": 0.01457974, + "auxiliary_loss_mlp": 0.01251421, + "balance_loss_clip": 1.14938903, + "balance_loss_mlp": 1.04027748, + "epoch": 0.49733954606944236, + "flos": 24282039945600.0, + "grad_norm": 1.4427289662577107, + "language_loss": 0.78517485, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.81226879, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.8167495727539062 + }, + { + "auxiliary_loss_clip": 0.01463333, + "auxiliary_loss_mlp": 0.01248137, + "balance_loss_clip": 1.15474081, + "balance_loss_mlp": 1.03546798, + "epoch": 0.4973996693221103, + "flos": 37855639876320.0, + "grad_norm": 2.072365733124484, + "language_loss": 0.66632879, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.69344342, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.9257240295410156 + }, + { + "auxiliary_loss_clip": 0.01462727, + "auxiliary_loss_mlp": 0.0124907, + "balance_loss_clip": 1.15467191, + "balance_loss_mlp": 1.04002428, + "epoch": 0.4974597925747783, + "flos": 21363590600640.0, + "grad_norm": 1.8051872181103843, + "language_loss": 0.78278047, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80989844, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 2.7207202911376953 + }, + { + "auxiliary_loss_clip": 0.01461904, + "auxiliary_loss_mlp": 0.01251846, + "balance_loss_clip": 1.152542, + "balance_loss_mlp": 1.03879535, + "epoch": 0.49751991582744626, + "flos": 30740814940320.0, + "grad_norm": 1.9834381754543187, + "language_loss": 0.75720185, + "learning_rate": 2.113078285889493e-06, + "loss": 0.78433931, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.9243838787078857 + }, + { + "auxiliary_loss_clip": 0.01465072, + "auxiliary_loss_mlp": 0.01242445, + "balance_loss_clip": 1.15611458, + "balance_loss_mlp": 1.02882195, + "epoch": 0.4975800390801142, + "flos": 14102285653440.0, + "grad_norm": 2.4505194551357903, + "language_loss": 0.84028733, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.8673625, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.7996022701263428 + }, + { + "auxiliary_loss_clip": 0.01459667, + "auxiliary_loss_mlp": 0.01236122, + "balance_loss_clip": 1.15272892, + "balance_loss_mlp": 1.02612305, + "epoch": 0.4976401623327822, + "flos": 24209596431360.0, + "grad_norm": 1.5684585392164676, + "language_loss": 0.70325881, + "learning_rate": 2.112300599949172e-06, + "loss": 0.73021674, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 2.7871172428131104 + }, + { + "auxiliary_loss_clip": 0.01460273, + "auxiliary_loss_mlp": 0.01242415, + "balance_loss_clip": 1.15164948, + "balance_loss_mlp": 1.03184366, + "epoch": 0.49770028558545015, + "flos": 21138598569600.0, + "grad_norm": 1.8762674172771117, + "language_loss": 0.82304358, + "learning_rate": 2.111911750583964e-06, + "loss": 0.85007048, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 4.346324682235718 + }, + { + "auxiliary_loss_clip": 0.0146235, + "auxiliary_loss_mlp": 0.01249545, + "balance_loss_clip": 1.15439928, + "balance_loss_mlp": 1.03801966, + "epoch": 0.4977604088381181, + "flos": 16765817284800.0, + "grad_norm": 1.9810556778509727, + "language_loss": 0.67844152, + "learning_rate": 2.111522896975052e-06, + "loss": 0.70556045, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.754643440246582 + }, + { + "auxiliary_loss_clip": 0.01456712, + "auxiliary_loss_mlp": 0.01247035, + "balance_loss_clip": 1.14838862, + "balance_loss_mlp": 1.03684497, + "epoch": 0.49782053209078614, + "flos": 15705221581440.0, + "grad_norm": 2.674167039334621, + "language_loss": 0.70868421, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.73572171, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.7981386184692383 + }, + { + "auxiliary_loss_clip": 0.01454377, + "auxiliary_loss_mlp": 0.01233493, + "balance_loss_clip": 1.14703369, + "balance_loss_mlp": 1.02482951, + "epoch": 0.4978806553434541, + "flos": 24755994969120.0, + "grad_norm": 3.9352751842961875, + "language_loss": 0.64540029, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67227906, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 2.8476951122283936 + }, + { + "auxiliary_loss_clip": 0.01465921, + "auxiliary_loss_mlp": 0.01249331, + "balance_loss_clip": 1.15738058, + "balance_loss_mlp": 1.03704309, + "epoch": 0.49794077859612207, + "flos": 13117433214240.0, + "grad_norm": 2.701532012142986, + "language_loss": 0.72406328, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.75121582, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 2.732370615005493 + }, + { + "auxiliary_loss_clip": 0.01462119, + "auxiliary_loss_mlp": 0.01238123, + "balance_loss_clip": 1.15498054, + "balance_loss_mlp": 1.02888715, + "epoch": 0.49800090184879003, + "flos": 27527509164960.0, + "grad_norm": 1.5884675287153183, + "language_loss": 0.73544991, + "learning_rate": 2.109967440397263e-06, + "loss": 0.76245236, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.8740031719207764 + }, + { + "auxiliary_loss_clip": 0.01465062, + "auxiliary_loss_mlp": 0.01243474, + "balance_loss_clip": 1.15678692, + "balance_loss_mlp": 1.03118634, + "epoch": 0.498061025101458, + "flos": 19794562812000.0, + "grad_norm": 1.6533565308313005, + "language_loss": 0.78986645, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81695175, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 2.7897257804870605 + }, + { + "auxiliary_loss_clip": 0.01462066, + "auxiliary_loss_mlp": 0.01238038, + "balance_loss_clip": 1.15291214, + "balance_loss_mlp": 1.02441478, + "epoch": 0.49812114835412596, + "flos": 29896260222240.0, + "grad_norm": 1.802351972372379, + "language_loss": 0.73762149, + "learning_rate": 2.109189687029526e-06, + "loss": 0.76462245, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 2.8727362155914307 + }, + { + "auxiliary_loss_clip": 0.01471692, + "auxiliary_loss_mlp": 0.01246267, + "balance_loss_clip": 1.16447783, + "balance_loss_mlp": 1.03321576, + "epoch": 0.49818127160679393, + "flos": 23149190368800.0, + "grad_norm": 1.8124757571835295, + "language_loss": 0.74218702, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76936662, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 4.272755861282349 + }, + { + "auxiliary_loss_clip": 0.01466946, + "auxiliary_loss_mlp": 0.0125123, + "balance_loss_clip": 1.15910208, + "balance_loss_mlp": 1.03894198, + "epoch": 0.4982413948594619, + "flos": 21654957640320.0, + "grad_norm": 2.727132543404204, + "language_loss": 0.85592782, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.88310969, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 4.353255748748779 + }, + { + "auxiliary_loss_clip": 0.01463762, + "auxiliary_loss_mlp": 0.01247725, + "balance_loss_clip": 1.15671957, + "balance_loss_mlp": 1.0342927, + "epoch": 0.49830151811212986, + "flos": 32489624028960.0, + "grad_norm": 1.6999103905144801, + "language_loss": 0.72247189, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74958682, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 2.9213407039642334 + }, + { + "auxiliary_loss_clip": 0.01471776, + "auxiliary_loss_mlp": 0.01245993, + "balance_loss_clip": 1.16389823, + "balance_loss_mlp": 1.0304625, + "epoch": 0.4983616413647978, + "flos": 18143874966240.0, + "grad_norm": 2.971933285388748, + "language_loss": 0.8065713, + "learning_rate": 2.10763413072622e-06, + "loss": 0.833749, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.800017833709717 + }, + { + "auxiliary_loss_clip": 0.01462388, + "auxiliary_loss_mlp": 0.01243723, + "balance_loss_clip": 1.15362716, + "balance_loss_mlp": 1.03277016, + "epoch": 0.4984217646174658, + "flos": 19720867668480.0, + "grad_norm": 2.2361938221268653, + "language_loss": 0.73097187, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75803304, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 2.751880645751953 + }, + { + "auxiliary_loss_clip": 0.01470697, + "auxiliary_loss_mlp": 0.01246665, + "balance_loss_clip": 1.16348016, + "balance_loss_mlp": 1.03456831, + "epoch": 0.49848188787013376, + "flos": 24938886378240.0, + "grad_norm": 1.8512052456414616, + "language_loss": 0.84080398, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86797762, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.999351978302002 + }, + { + "auxiliary_loss_clip": 0.01465401, + "auxiliary_loss_mlp": 0.01249771, + "balance_loss_clip": 1.15707552, + "balance_loss_mlp": 1.0357666, + "epoch": 0.4985420111228017, + "flos": 22384626868800.0, + "grad_norm": 1.8222418295101144, + "language_loss": 0.6720072, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69915903, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 4.215372085571289 + }, + { + "auxiliary_loss_clip": 0.01465415, + "auxiliary_loss_mlp": 0.01249367, + "balance_loss_clip": 1.15825939, + "balance_loss_mlp": 1.03765106, + "epoch": 0.4986021343754697, + "flos": 16218318830400.0, + "grad_norm": 1.6258145569923719, + "language_loss": 0.66852033, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69566816, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.7215816974639893 + }, + { + "auxiliary_loss_clip": 0.01470561, + "auxiliary_loss_mlp": 0.01246497, + "balance_loss_clip": 1.1641922, + "balance_loss_mlp": 1.03420949, + "epoch": 0.4986622576281377, + "flos": 23405814849600.0, + "grad_norm": 2.1490977812354295, + "language_loss": 0.82051516, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84768575, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.8639280796051025 + }, + { + "auxiliary_loss_clip": 0.01468793, + "auxiliary_loss_mlp": 0.01238214, + "balance_loss_clip": 1.16072488, + "balance_loss_mlp": 1.0255444, + "epoch": 0.49872238088080567, + "flos": 19976619801600.0, + "grad_norm": 3.1854788469431212, + "language_loss": 0.7281431, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.75521314, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.719921112060547 + }, + { + "auxiliary_loss_clip": 0.01467704, + "auxiliary_loss_mlp": 0.01237901, + "balance_loss_clip": 1.16042209, + "balance_loss_mlp": 1.02694774, + "epoch": 0.49878250413347364, + "flos": 22895220859200.0, + "grad_norm": 2.0669841207166333, + "language_loss": 0.67493713, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.70199317, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.794412851333618 + }, + { + "auxiliary_loss_clip": 0.01467481, + "auxiliary_loss_mlp": 0.01244372, + "balance_loss_clip": 1.15936506, + "balance_loss_mlp": 1.0292232, + "epoch": 0.4988426273861416, + "flos": 32601209768640.0, + "grad_norm": 2.1180335736592273, + "language_loss": 0.64609456, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67321312, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.8444111347198486 + }, + { + "auxiliary_loss_clip": 0.01470667, + "auxiliary_loss_mlp": 0.01244373, + "balance_loss_clip": 1.16405451, + "balance_loss_mlp": 1.03418279, + "epoch": 0.49890275063880957, + "flos": 20925743549760.0, + "grad_norm": 1.688527660848399, + "language_loss": 0.69805038, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.72520077, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 2.7925193309783936 + }, + { + "auxiliary_loss_clip": 0.01460861, + "auxiliary_loss_mlp": 0.01232455, + "balance_loss_clip": 1.15304852, + "balance_loss_mlp": 1.02302861, + "epoch": 0.49896287389147753, + "flos": 18626439682080.0, + "grad_norm": 1.8929831806712383, + "language_loss": 0.85012305, + "learning_rate": 2.103744956327814e-06, + "loss": 0.87705624, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 2.8476898670196533 + }, + { + "auxiliary_loss_clip": 0.01470621, + "auxiliary_loss_mlp": 0.01248051, + "balance_loss_clip": 1.16275239, + "balance_loss_mlp": 1.03404653, + "epoch": 0.4990229971441455, + "flos": 24828855693120.0, + "grad_norm": 3.6557933234942315, + "language_loss": 0.69337928, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.72056603, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 2.8175673484802246 + }, + { + "auxiliary_loss_clip": 0.01598838, + "auxiliary_loss_mlp": 0.0121283, + "balance_loss_clip": 1.31656456, + "balance_loss_mlp": 1.01446533, + "epoch": 0.49908312039681346, + "flos": 71391598256160.0, + "grad_norm": 0.7520453429798881, + "language_loss": 0.5115881, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.5397048, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 3.4565398693084717 + }, + { + "auxiliary_loss_clip": 0.01465934, + "auxiliary_loss_mlp": 0.01244398, + "balance_loss_clip": 1.15862131, + "balance_loss_mlp": 1.03649712, + "epoch": 0.4991432436494814, + "flos": 19830708712800.0, + "grad_norm": 1.811040088646445, + "language_loss": 0.8434453, + "learning_rate": 2.102578126623879e-06, + "loss": 0.87054861, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 2.741314649581909 + }, + { + "auxiliary_loss_clip": 0.01469042, + "auxiliary_loss_mlp": 0.01235525, + "balance_loss_clip": 1.16256595, + "balance_loss_mlp": 1.02399981, + "epoch": 0.4992033669021494, + "flos": 15123701203200.0, + "grad_norm": 2.1014175756539455, + "language_loss": 0.68951589, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71656156, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.824429988861084 + }, + { + "auxiliary_loss_clip": 0.01465541, + "auxiliary_loss_mlp": 0.01246327, + "balance_loss_clip": 1.1593864, + "balance_loss_mlp": 1.03499293, + "epoch": 0.49926349015481736, + "flos": 31210408225440.0, + "grad_norm": 1.7907201063926212, + "language_loss": 0.72764993, + "learning_rate": 2.101800220681144e-06, + "loss": 0.75476861, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.9083340167999268 + }, + { + "auxiliary_loss_clip": 0.01467408, + "auxiliary_loss_mlp": 0.01235622, + "balance_loss_clip": 1.16013074, + "balance_loss_mlp": 1.02314341, + "epoch": 0.4993236134074853, + "flos": 24902550836640.0, + "grad_norm": 2.2408249239000195, + "language_loss": 0.80733675, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83436704, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 2.7722764015197754 + }, + { + "auxiliary_loss_clip": 0.01591838, + "auxiliary_loss_mlp": 0.01215774, + "balance_loss_clip": 1.31124234, + "balance_loss_mlp": 1.01664734, + "epoch": 0.4993837366601533, + "flos": 70426734321600.0, + "grad_norm": 0.7095854118624999, + "language_loss": 0.56833053, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.5964067, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.474350690841675 + }, + { + "auxiliary_loss_clip": 0.0146465, + "auxiliary_loss_mlp": 0.01252741, + "balance_loss_clip": 1.15733826, + "balance_loss_mlp": 1.04102516, + "epoch": 0.4994438599128213, + "flos": 15963059763360.0, + "grad_norm": 1.7562085527355633, + "language_loss": 0.82287693, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.85005081, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.7527358531951904 + }, + { + "auxiliary_loss_clip": 0.01466576, + "auxiliary_loss_mlp": 0.01244596, + "balance_loss_clip": 1.15928042, + "balance_loss_mlp": 1.03307104, + "epoch": 0.4995039831654893, + "flos": 27930803297760.0, + "grad_norm": 1.8272898952415992, + "language_loss": 0.60909522, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63620698, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 2.849167823791504 + }, + { + "auxiliary_loss_clip": 0.01461674, + "auxiliary_loss_mlp": 0.01242321, + "balance_loss_clip": 1.15428901, + "balance_loss_mlp": 1.03003287, + "epoch": 0.49956410641815724, + "flos": 24206979388320.0, + "grad_norm": 1.659900699337983, + "language_loss": 0.74836266, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.77540255, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.7691867351531982 + }, + { + "auxiliary_loss_clip": 0.01458449, + "auxiliary_loss_mlp": 0.01244996, + "balance_loss_clip": 1.15234804, + "balance_loss_mlp": 1.03366208, + "epoch": 0.4996242296708252, + "flos": 16181983288800.0, + "grad_norm": 2.231518634980698, + "language_loss": 0.80089033, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82792473, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 2.7811901569366455 + }, + { + "auxiliary_loss_clip": 0.01463282, + "auxiliary_loss_mlp": 0.01239881, + "balance_loss_clip": 1.15586972, + "balance_loss_mlp": 1.02606773, + "epoch": 0.49968435292349317, + "flos": 16875430760160.0, + "grad_norm": 1.8572788774651965, + "language_loss": 0.71000993, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73704159, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.7884321212768555 + }, + { + "auxiliary_loss_clip": 0.01462157, + "auxiliary_loss_mlp": 0.01238792, + "balance_loss_clip": 1.15475917, + "balance_loss_mlp": 1.02841115, + "epoch": 0.49974447617616113, + "flos": 14941302860160.0, + "grad_norm": 1.8941335305747151, + "language_loss": 0.77433085, + "learning_rate": 2.098688443679187e-06, + "loss": 0.80134034, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 2.7644100189208984 + }, + { + "auxiliary_loss_clip": 0.01464378, + "auxiliary_loss_mlp": 0.01235738, + "balance_loss_clip": 1.15651429, + "balance_loss_mlp": 1.02097011, + "epoch": 0.4998045994288291, + "flos": 26653976968320.0, + "grad_norm": 1.7651708756228888, + "language_loss": 0.85061496, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.87761611, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.8647665977478027 + }, + { + "auxiliary_loss_clip": 0.01457432, + "auxiliary_loss_mlp": 0.01237557, + "balance_loss_clip": 1.14976931, + "balance_loss_mlp": 1.0243156, + "epoch": 0.49986472268149706, + "flos": 20955555447840.0, + "grad_norm": 4.372741839242397, + "language_loss": 0.8063426, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83329248, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 2.7612955570220947 + }, + { + "auxiliary_loss_clip": 0.014593, + "auxiliary_loss_mlp": 0.01242568, + "balance_loss_clip": 1.15202546, + "balance_loss_mlp": 1.02970731, + "epoch": 0.49992484593416503, + "flos": 22786100449920.0, + "grad_norm": 2.043518636638327, + "language_loss": 0.79249221, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81951082, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 2.806007146835327 + }, + { + "auxiliary_loss_clip": 0.01459755, + "auxiliary_loss_mlp": 0.01241707, + "balance_loss_clip": 1.1518929, + "balance_loss_mlp": 1.03151703, + "epoch": 0.499984969186833, + "flos": 46790276145120.0, + "grad_norm": 2.380803284075043, + "language_loss": 0.74514008, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.77215469, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 2.9595813751220703 + }, + { + "auxiliary_loss_clip": 0.01463965, + "auxiliary_loss_mlp": 0.01239077, + "balance_loss_clip": 1.15731823, + "balance_loss_mlp": 1.02659845, + "epoch": 0.500045092439501, + "flos": 25559359341120.0, + "grad_norm": 1.5416831441428764, + "language_loss": 0.81488931, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.84191966, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 4.31138014793396 + }, + { + "auxiliary_loss_clip": 0.0145675, + "auxiliary_loss_mlp": 0.01246715, + "balance_loss_clip": 1.14751291, + "balance_loss_mlp": 1.0361439, + "epoch": 0.5001052156921689, + "flos": 20706895880640.0, + "grad_norm": 1.677143553840703, + "language_loss": 0.83457851, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.86161315, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 2.772578716278076 + }, + { + "auxiliary_loss_clip": 0.01455731, + "auxiliary_loss_mlp": 0.01230998, + "balance_loss_clip": 1.14758682, + "balance_loss_mlp": 1.01890063, + "epoch": 0.500165338944837, + "flos": 21253218562080.0, + "grad_norm": 1.858209320893781, + "language_loss": 0.81739187, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.84425914, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.876767873764038 + }, + { + "auxiliary_loss_clip": 0.01457913, + "auxiliary_loss_mlp": 0.01238311, + "balance_loss_clip": 1.14959598, + "balance_loss_mlp": 1.02926564, + "epoch": 0.5002254621975049, + "flos": 27856842657120.0, + "grad_norm": 1.5945704873012407, + "language_loss": 0.71616828, + "learning_rate": 2.095576427171635e-06, + "loss": 0.74313051, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 2.850142478942871 + }, + { + "auxiliary_loss_clip": 0.01448636, + "auxiliary_loss_mlp": 0.01255774, + "balance_loss_clip": 1.13940501, + "balance_loss_mlp": 1.04196012, + "epoch": 0.5002855854501729, + "flos": 15553317843360.0, + "grad_norm": 3.6500317987206556, + "language_loss": 0.77059007, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79763418, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 2.747248888015747 + }, + { + "auxiliary_loss_clip": 0.01456612, + "auxiliary_loss_mlp": 0.01246279, + "balance_loss_clip": 1.14783621, + "balance_loss_mlp": 1.03227472, + "epoch": 0.5003457087028408, + "flos": 16109274277440.0, + "grad_norm": 1.8003590260156666, + "language_loss": 0.83240807, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85943699, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.859874725341797 + }, + { + "auxiliary_loss_clip": 0.0146397, + "auxiliary_loss_mlp": 0.01244361, + "balance_loss_clip": 1.15655589, + "balance_loss_mlp": 1.03378987, + "epoch": 0.5004058319555088, + "flos": 22712708731680.0, + "grad_norm": 2.2616067370559825, + "language_loss": 0.73679423, + "learning_rate": 2.094409360775228e-06, + "loss": 0.76387751, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 2.7628026008605957 + }, + { + "auxiliary_loss_clip": 0.01454743, + "auxiliary_loss_mlp": 0.01249706, + "balance_loss_clip": 1.14594102, + "balance_loss_mlp": 1.03951645, + "epoch": 0.5004659552081767, + "flos": 30120607474560.0, + "grad_norm": 1.4716164929969626, + "language_loss": 0.69646811, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.72351259, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 4.293035268783569 + }, + { + "auxiliary_loss_clip": 0.01453276, + "auxiliary_loss_mlp": 0.01250349, + "balance_loss_clip": 1.14501369, + "balance_loss_mlp": 1.03863299, + "epoch": 0.5005260784608447, + "flos": 18626705179200.0, + "grad_norm": 2.8555121057265636, + "language_loss": 0.7263881, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.75342429, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 4.165910482406616 + }, + { + "auxiliary_loss_clip": 0.014617, + "auxiliary_loss_mlp": 0.01246183, + "balance_loss_clip": 1.1531713, + "balance_loss_mlp": 1.03503919, + "epoch": 0.5005862017135126, + "flos": 24862119053760.0, + "grad_norm": 1.6283775299723529, + "language_loss": 0.7337141, + "learning_rate": 2.093242262158709e-06, + "loss": 0.76079285, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 2.7876694202423096 + }, + { + "auxiliary_loss_clip": 0.01460549, + "auxiliary_loss_mlp": 0.01237245, + "balance_loss_clip": 1.15148842, + "balance_loss_mlp": 1.02819979, + "epoch": 0.5006463249661807, + "flos": 18736394510880.0, + "grad_norm": 2.168856308113392, + "language_loss": 0.78002667, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80700463, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 2.76176118850708 + }, + { + "auxiliary_loss_clip": 0.01458118, + "auxiliary_loss_mlp": 0.01245217, + "balance_loss_clip": 1.15003383, + "balance_loss_mlp": 1.03540874, + "epoch": 0.5007064482188487, + "flos": 13043813927040.0, + "grad_norm": 2.4405706883958103, + "language_loss": 0.87724233, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90427566, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.788719892501831 + }, + { + "auxiliary_loss_clip": 0.01454393, + "auxiliary_loss_mlp": 0.01243774, + "balance_loss_clip": 1.14611292, + "balance_loss_mlp": 1.03320312, + "epoch": 0.5007665714715166, + "flos": 21290843661120.0, + "grad_norm": 2.2356769143411497, + "language_loss": 0.74675262, + "learning_rate": 2.092075131720388e-06, + "loss": 0.77373433, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.8065245151519775 + }, + { + "auxiliary_loss_clip": 0.01461769, + "auxiliary_loss_mlp": 0.01237448, + "balance_loss_clip": 1.15312576, + "balance_loss_mlp": 1.02763939, + "epoch": 0.5008266947241846, + "flos": 29757289986720.0, + "grad_norm": 1.6605863999935035, + "language_loss": 0.79591084, + "learning_rate": 2.091686081238281e-06, + "loss": 0.82290298, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 2.8542349338531494 + }, + { + "auxiliary_loss_clip": 0.01587176, + "auxiliary_loss_mlp": 0.01188271, + "balance_loss_clip": 1.304775, + "balance_loss_mlp": 0.98685455, + "epoch": 0.5008868179768525, + "flos": 63563413564800.0, + "grad_norm": 0.7344986088038924, + "language_loss": 0.56024659, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58800107, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 4.660766839981079 + }, + { + "auxiliary_loss_clip": 0.01465327, + "auxiliary_loss_mlp": 0.01242725, + "balance_loss_clip": 1.15708804, + "balance_loss_mlp": 1.03215408, + "epoch": 0.5009469412295205, + "flos": 27377805260160.0, + "grad_norm": 2.2279640299221692, + "language_loss": 0.65390396, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.68098444, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.875675916671753 + }, + { + "auxiliary_loss_clip": 0.01460238, + "auxiliary_loss_mlp": 0.01248132, + "balance_loss_clip": 1.15196681, + "balance_loss_mlp": 1.03813243, + "epoch": 0.5010070644821885, + "flos": 27381180866400.0, + "grad_norm": 1.5942849132107109, + "language_loss": 0.74643385, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.77351755, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.7710752487182617 + }, + { + "auxiliary_loss_clip": 0.01458789, + "auxiliary_loss_mlp": 0.01242566, + "balance_loss_clip": 1.15170419, + "balance_loss_mlp": 1.02951503, + "epoch": 0.5010671877348565, + "flos": 20664567689760.0, + "grad_norm": 2.152445657088993, + "language_loss": 0.80291575, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82992935, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.7848148345947266 + }, + { + "auxiliary_loss_clip": 0.01583943, + "auxiliary_loss_mlp": 0.01187164, + "balance_loss_clip": 1.30306518, + "balance_loss_mlp": 0.98727417, + "epoch": 0.5011273109875244, + "flos": 59135218509600.0, + "grad_norm": 0.8902938515700568, + "language_loss": 0.62608987, + "learning_rate": 2.089740776971626e-06, + "loss": 0.65380096, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.242927312850952 + }, + { + "auxiliary_loss_clip": 0.01453285, + "auxiliary_loss_mlp": 0.01241031, + "balance_loss_clip": 1.1440587, + "balance_loss_mlp": 1.03313041, + "epoch": 0.5011874342401924, + "flos": 25338615264000.0, + "grad_norm": 1.6534646208659078, + "language_loss": 0.79555988, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.82250303, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.7913711071014404 + }, + { + "auxiliary_loss_clip": 0.01464225, + "auxiliary_loss_mlp": 0.0126073, + "balance_loss_clip": 1.15467405, + "balance_loss_mlp": 1.05168462, + "epoch": 0.5012475574928603, + "flos": 20232030581280.0, + "grad_norm": 1.8066463185593071, + "language_loss": 0.80065769, + "learning_rate": 2.088962631340836e-06, + "loss": 0.8279072, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.786714553833008 + }, + { + "auxiliary_loss_clip": 0.01457288, + "auxiliary_loss_mlp": 0.01254154, + "balance_loss_clip": 1.14682698, + "balance_loss_mlp": 1.04491746, + "epoch": 0.5013076807455283, + "flos": 22712291521920.0, + "grad_norm": 2.2703407160939832, + "language_loss": 0.79294109, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.8200556, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.8212523460388184 + }, + { + "auxiliary_loss_clip": 0.01455496, + "auxiliary_loss_mlp": 0.01248204, + "balance_loss_clip": 1.14580321, + "balance_loss_mlp": 1.03915858, + "epoch": 0.5013678039981962, + "flos": 24247562883840.0, + "grad_norm": 1.5884151928838923, + "language_loss": 0.85117602, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87821299, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 2.8351738452911377 + }, + { + "auxiliary_loss_clip": 0.01462583, + "auxiliary_loss_mlp": 0.01252308, + "balance_loss_clip": 1.15191913, + "balance_loss_mlp": 1.04402506, + "epoch": 0.5014279272508643, + "flos": 26179073740800.0, + "grad_norm": 2.4634923977534764, + "language_loss": 0.70932287, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.73647177, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 2.846975564956665 + }, + { + "auxiliary_loss_clip": 0.01458313, + "auxiliary_loss_mlp": 0.0125645, + "balance_loss_clip": 1.14905834, + "balance_loss_mlp": 1.04664111, + "epoch": 0.5014880505035323, + "flos": 21432393011520.0, + "grad_norm": 1.9075582135598528, + "language_loss": 0.78200865, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.8091563, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 2.831397771835327 + }, + { + "auxiliary_loss_clip": 0.01463619, + "auxiliary_loss_mlp": 0.01249523, + "balance_loss_clip": 1.1531285, + "balance_loss_mlp": 1.03723526, + "epoch": 0.5015481737562002, + "flos": 15772127584320.0, + "grad_norm": 3.0044371532301968, + "language_loss": 0.89268398, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91981536, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.7451767921447754 + }, + { + "auxiliary_loss_clip": 0.01460436, + "auxiliary_loss_mlp": 0.01262157, + "balance_loss_clip": 1.15046096, + "balance_loss_mlp": 1.05520976, + "epoch": 0.5016082970088682, + "flos": 26833075561440.0, + "grad_norm": 1.9984481357226653, + "language_loss": 0.76126617, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78849208, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.7782633304595947 + }, + { + "auxiliary_loss_clip": 0.01461596, + "auxiliary_loss_mlp": 0.01242355, + "balance_loss_clip": 1.15131736, + "balance_loss_mlp": 1.03578913, + "epoch": 0.5016684202615361, + "flos": 21472748938080.0, + "grad_norm": 2.36298049665989, + "language_loss": 0.66996598, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69700551, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 2.7736001014709473 + }, + { + "auxiliary_loss_clip": 0.01462744, + "auxiliary_loss_mlp": 0.01249978, + "balance_loss_clip": 1.15429354, + "balance_loss_mlp": 1.04207706, + "epoch": 0.5017285435142042, + "flos": 26249051924640.0, + "grad_norm": 2.2517902821130122, + "language_loss": 0.75351775, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.78064501, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 2.7903246879577637 + }, + { + "auxiliary_loss_clip": 0.01462667, + "auxiliary_loss_mlp": 0.01243493, + "balance_loss_clip": 1.15375113, + "balance_loss_mlp": 1.03349447, + "epoch": 0.5017886667668721, + "flos": 20779680748320.0, + "grad_norm": 2.252564147626207, + "language_loss": 0.7848711, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.81193268, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.7916648387908936 + }, + { + "auxiliary_loss_clip": 0.01462838, + "auxiliary_loss_mlp": 0.01257181, + "balance_loss_clip": 1.15105486, + "balance_loss_mlp": 1.04928017, + "epoch": 0.5018487900195401, + "flos": 20158449222240.0, + "grad_norm": 1.6150274184219255, + "language_loss": 0.69022948, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71742964, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 2.770963430404663 + }, + { + "auxiliary_loss_clip": 0.01461895, + "auxiliary_loss_mlp": 0.01249813, + "balance_loss_clip": 1.15267253, + "balance_loss_mlp": 1.0390507, + "epoch": 0.501908913272208, + "flos": 18152674299360.0, + "grad_norm": 2.1222305225365323, + "language_loss": 0.7152102, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.74232727, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.809030771255493 + }, + { + "auxiliary_loss_clip": 0.01461371, + "auxiliary_loss_mlp": 0.01245202, + "balance_loss_clip": 1.15172768, + "balance_loss_mlp": 1.03749132, + "epoch": 0.501969036524876, + "flos": 23114523666240.0, + "grad_norm": 1.413454833672437, + "language_loss": 0.7430737, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.7701394, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.758883476257324 + }, + { + "auxiliary_loss_clip": 0.01463625, + "auxiliary_loss_mlp": 0.01234676, + "balance_loss_clip": 1.15260351, + "balance_loss_mlp": 1.02315104, + "epoch": 0.5020291597775439, + "flos": 11364755453280.0, + "grad_norm": 2.3826920346462765, + "language_loss": 0.641541, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.66852397, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.8105974197387695 + }, + { + "auxiliary_loss_clip": 0.01593601, + "auxiliary_loss_mlp": 0.01232658, + "balance_loss_clip": 1.31246829, + "balance_loss_mlp": 1.03582001, + "epoch": 0.5020892830302119, + "flos": 64017569724480.0, + "grad_norm": 0.7767655943058063, + "language_loss": 0.597826, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6260885, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.502586603164673 + }, + { + "auxiliary_loss_clip": 0.01456524, + "auxiliary_loss_mlp": 0.01246506, + "balance_loss_clip": 1.14612162, + "balance_loss_mlp": 1.03707933, + "epoch": 0.5021494062828799, + "flos": 23735148341760.0, + "grad_norm": 2.0605010818868457, + "language_loss": 0.75213325, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.7791636, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 2.8333628177642822 + }, + { + "auxiliary_loss_clip": 0.01458124, + "auxiliary_loss_mlp": 0.01234624, + "balance_loss_clip": 1.14863682, + "balance_loss_mlp": 1.02348065, + "epoch": 0.5022095295355479, + "flos": 21578721310080.0, + "grad_norm": 3.0165042375785056, + "language_loss": 0.71881008, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74573755, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 2.817059278488159 + }, + { + "auxiliary_loss_clip": 0.01463228, + "auxiliary_loss_mlp": 0.01248934, + "balance_loss_clip": 1.15325093, + "balance_loss_mlp": 1.03645515, + "epoch": 0.5022696527882159, + "flos": 21399357219840.0, + "grad_norm": 1.6335114487879716, + "language_loss": 0.74213946, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.76926112, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.7757415771484375 + }, + { + "auxiliary_loss_clip": 0.0145748, + "auxiliary_loss_mlp": 0.0124732, + "balance_loss_clip": 1.14668667, + "balance_loss_mlp": 1.03560412, + "epoch": 0.5023297760408838, + "flos": 27163091760480.0, + "grad_norm": 1.4806854344187543, + "language_loss": 0.72669399, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.75374198, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 4.353864431381226 + }, + { + "auxiliary_loss_clip": 0.01464599, + "auxiliary_loss_mlp": 0.01242876, + "balance_loss_clip": 1.15393162, + "balance_loss_mlp": 1.02848971, + "epoch": 0.5023898992935518, + "flos": 26216243701920.0, + "grad_norm": 2.3110409790010826, + "language_loss": 0.80924535, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83632004, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 2.8796706199645996 + }, + { + "auxiliary_loss_clip": 0.01455721, + "auxiliary_loss_mlp": 0.01236682, + "balance_loss_clip": 1.14446843, + "balance_loss_mlp": 1.02172399, + "epoch": 0.5024500225462197, + "flos": 13441836045600.0, + "grad_norm": 2.4175952632536615, + "language_loss": 0.76110458, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.7880286, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 2.773037910461426 + }, + { + "auxiliary_loss_clip": 0.01454629, + "auxiliary_loss_mlp": 0.01232156, + "balance_loss_clip": 1.14485526, + "balance_loss_mlp": 1.02158427, + "epoch": 0.5025101457988878, + "flos": 21581717634720.0, + "grad_norm": 1.7313144093357806, + "language_loss": 0.76522028, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.79208815, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 2.8589792251586914 + }, + { + "auxiliary_loss_clip": 0.01456127, + "auxiliary_loss_mlp": 0.01238541, + "balance_loss_clip": 1.14595342, + "balance_loss_mlp": 1.02758861, + "epoch": 0.5025702690515557, + "flos": 24647481410400.0, + "grad_norm": 2.196421695070092, + "language_loss": 0.72137165, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74831831, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 2.82431960105896 + }, + { + "auxiliary_loss_clip": 0.01463636, + "auxiliary_loss_mlp": 0.01240323, + "balance_loss_clip": 1.1532433, + "balance_loss_mlp": 1.02975202, + "epoch": 0.5026303923042237, + "flos": 22092311625120.0, + "grad_norm": 1.6325318025591211, + "language_loss": 0.77180785, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79884738, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 2.8372421264648438 + }, + { + "auxiliary_loss_clip": 0.01459663, + "auxiliary_loss_mlp": 0.0124225, + "balance_loss_clip": 1.14902163, + "balance_loss_mlp": 1.03091621, + "epoch": 0.5026905155568916, + "flos": 23699533435200.0, + "grad_norm": 1.8098966560753798, + "language_loss": 0.76851386, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.79553306, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.8477303981781006 + }, + { + "auxiliary_loss_clip": 0.01456783, + "auxiliary_loss_mlp": 0.01242244, + "balance_loss_clip": 1.14680433, + "balance_loss_mlp": 1.02881205, + "epoch": 0.5027506388095596, + "flos": 25814808048960.0, + "grad_norm": 1.5676995912427074, + "language_loss": 0.85069126, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87768155, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 4.33314061164856 + }, + { + "auxiliary_loss_clip": 0.01456717, + "auxiliary_loss_mlp": 0.01235483, + "balance_loss_clip": 1.14607477, + "balance_loss_mlp": 1.02414894, + "epoch": 0.5028107620622275, + "flos": 27529064219520.0, + "grad_norm": 1.756112982617094, + "language_loss": 0.78136665, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80828863, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.8800408840179443 + }, + { + "auxiliary_loss_clip": 0.01456418, + "auxiliary_loss_mlp": 0.01236867, + "balance_loss_clip": 1.1466583, + "balance_loss_mlp": 1.02362514, + "epoch": 0.5028708853148955, + "flos": 24536540449440.0, + "grad_norm": 3.835415502839879, + "language_loss": 0.75701261, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.78394544, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 4.348700523376465 + }, + { + "auxiliary_loss_clip": 0.01457077, + "auxiliary_loss_mlp": 0.01230018, + "balance_loss_clip": 1.14712453, + "balance_loss_mlp": 1.01982844, + "epoch": 0.5029310085675635, + "flos": 20815712864640.0, + "grad_norm": 1.7526761989180262, + "language_loss": 0.70008183, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.72695279, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 2.820882558822632 + }, + { + "auxiliary_loss_clip": 0.01460269, + "auxiliary_loss_mlp": 0.01231019, + "balance_loss_clip": 1.15002871, + "balance_loss_mlp": 1.01606107, + "epoch": 0.5029911318202315, + "flos": 22344460583040.0, + "grad_norm": 1.6933072106915457, + "language_loss": 0.73078054, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75769341, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 2.8077659606933594 + }, + { + "auxiliary_loss_clip": 0.01470168, + "auxiliary_loss_mlp": 0.01244055, + "balance_loss_clip": 1.16050696, + "balance_loss_mlp": 1.03062308, + "epoch": 0.5030512550728995, + "flos": 24355014454080.0, + "grad_norm": 2.0316514295159998, + "language_loss": 0.78359854, + "learning_rate": 2.077288893713735e-06, + "loss": 0.81074077, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 2.831983804702759 + }, + { + "auxiliary_loss_clip": 0.01464019, + "auxiliary_loss_mlp": 0.01233793, + "balance_loss_clip": 1.15410757, + "balance_loss_mlp": 1.02188611, + "epoch": 0.5031113783255674, + "flos": 18261908493120.0, + "grad_norm": 2.1423457566397968, + "language_loss": 0.69885826, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.7258364, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 4.219008445739746 + }, + { + "auxiliary_loss_clip": 0.01591454, + "auxiliary_loss_mlp": 0.01207069, + "balance_loss_clip": 1.30737209, + "balance_loss_mlp": 1.00946808, + "epoch": 0.5031715015782354, + "flos": 57258817997760.0, + "grad_norm": 0.8565978487893934, + "language_loss": 0.63292277, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.66090804, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 3.2587103843688965 + }, + { + "auxiliary_loss_clip": 0.01471103, + "auxiliary_loss_mlp": 0.01242102, + "balance_loss_clip": 1.16124821, + "balance_loss_mlp": 1.03057694, + "epoch": 0.5032316248309033, + "flos": 27529860710880.0, + "grad_norm": 1.7150765974956765, + "language_loss": 0.6052711, + "learning_rate": 2.076121368302263e-06, + "loss": 0.63240314, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 2.820065975189209 + }, + { + "auxiliary_loss_clip": 0.01461448, + "auxiliary_loss_mlp": 0.01234628, + "balance_loss_clip": 1.15042043, + "balance_loss_mlp": 1.02176785, + "epoch": 0.5032917480835714, + "flos": 34498698701760.0, + "grad_norm": 1.8881090036550987, + "language_loss": 0.68563771, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.71259844, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.901048421859741 + }, + { + "auxiliary_loss_clip": 0.01465221, + "auxiliary_loss_mlp": 0.01249978, + "balance_loss_clip": 1.15396214, + "balance_loss_mlp": 1.03883421, + "epoch": 0.5033518713362393, + "flos": 33659605638720.0, + "grad_norm": 2.35755143874961, + "language_loss": 0.67453176, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.70168376, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.8769021034240723 + }, + { + "auxiliary_loss_clip": 0.01460095, + "auxiliary_loss_mlp": 0.01239868, + "balance_loss_clip": 1.14951885, + "balance_loss_mlp": 1.02929652, + "epoch": 0.5034119945889073, + "flos": 28188110485440.0, + "grad_norm": 2.420190283440593, + "language_loss": 0.66837454, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.69537419, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.7746589183807373 + }, + { + "auxiliary_loss_clip": 0.01462536, + "auxiliary_loss_mlp": 0.01236697, + "balance_loss_clip": 1.1532371, + "balance_loss_mlp": 1.02822351, + "epoch": 0.5034721178415752, + "flos": 21360708060480.0, + "grad_norm": 1.9482263155740336, + "language_loss": 0.74829209, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.77528441, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.754612445831299 + }, + { + "auxiliary_loss_clip": 0.01466672, + "auxiliary_loss_mlp": 0.01245438, + "balance_loss_clip": 1.15591025, + "balance_loss_mlp": 1.03639293, + "epoch": 0.5035322410942432, + "flos": 22677055896960.0, + "grad_norm": 1.8053402932348614, + "language_loss": 0.67926395, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70638502, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.7691590785980225 + }, + { + "auxiliary_loss_clip": 0.01460049, + "auxiliary_loss_mlp": 0.01245298, + "balance_loss_clip": 1.14847875, + "balance_loss_mlp": 1.0301491, + "epoch": 0.5035923643469111, + "flos": 19830784569120.0, + "grad_norm": 1.9429530392281387, + "language_loss": 0.78907233, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.81612581, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.749166965484619 + }, + { + "auxiliary_loss_clip": 0.01457618, + "auxiliary_loss_mlp": 0.01242897, + "balance_loss_clip": 1.14708793, + "balance_loss_mlp": 1.03175402, + "epoch": 0.5036524875995791, + "flos": 30517036610400.0, + "grad_norm": 2.1596396377275995, + "language_loss": 0.60042918, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.62743431, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 2.8665530681610107 + }, + { + "auxiliary_loss_clip": 0.01454566, + "auxiliary_loss_mlp": 0.01238638, + "balance_loss_clip": 1.14285219, + "balance_loss_mlp": 1.02787614, + "epoch": 0.5037126108522471, + "flos": 14722341406560.0, + "grad_norm": 4.633520792270717, + "language_loss": 0.7609418, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78787386, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.774528741836548 + }, + { + "auxiliary_loss_clip": 0.01453857, + "auxiliary_loss_mlp": 0.01246157, + "balance_loss_clip": 1.14283299, + "balance_loss_mlp": 1.03501379, + "epoch": 0.5037727341049151, + "flos": 25299852320160.0, + "grad_norm": 1.7449125515827961, + "language_loss": 0.74849385, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.77549398, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 2.9109432697296143 + }, + { + "auxiliary_loss_clip": 0.01455331, + "auxiliary_loss_mlp": 0.01247336, + "balance_loss_clip": 1.14590049, + "balance_loss_mlp": 1.03848147, + "epoch": 0.5038328573575831, + "flos": 28543918197600.0, + "grad_norm": 3.140561755529813, + "language_loss": 0.66983688, + "learning_rate": 2.072229431544548e-06, + "loss": 0.69686353, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 2.809685468673706 + }, + { + "auxiliary_loss_clip": 0.01461173, + "auxiliary_loss_mlp": 0.01239505, + "balance_loss_clip": 1.15009785, + "balance_loss_mlp": 1.03084111, + "epoch": 0.503892980610251, + "flos": 31652579086560.0, + "grad_norm": 2.2757098108648, + "language_loss": 0.63423419, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66124105, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 2.8027453422546387 + }, + { + "auxiliary_loss_clip": 0.01453053, + "auxiliary_loss_mlp": 0.01241686, + "balance_loss_clip": 1.14288402, + "balance_loss_mlp": 1.03435707, + "epoch": 0.503953103862919, + "flos": 27091824019200.0, + "grad_norm": 1.5609578685141607, + "language_loss": 0.67465264, + "learning_rate": 2.071451010853365e-06, + "loss": 0.70160007, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.762897491455078 + }, + { + "auxiliary_loss_clip": 0.01460211, + "auxiliary_loss_mlp": 0.01247096, + "balance_loss_clip": 1.14913666, + "balance_loss_mlp": 1.03518987, + "epoch": 0.5040132271155869, + "flos": 15634788259680.0, + "grad_norm": 1.794420849318087, + "language_loss": 0.62338305, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.65045619, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 2.814296007156372 + }, + { + "auxiliary_loss_clip": 0.01455927, + "auxiliary_loss_mlp": 0.01249253, + "balance_loss_clip": 1.14448345, + "balance_loss_mlp": 1.04077947, + "epoch": 0.504073350368255, + "flos": 13591843375680.0, + "grad_norm": 2.5124868989390867, + "language_loss": 0.66883957, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69589132, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.7147250175476074 + }, + { + "auxiliary_loss_clip": 0.01456174, + "auxiliary_loss_mlp": 0.01239063, + "balance_loss_clip": 1.14456689, + "balance_loss_mlp": 1.02868271, + "epoch": 0.5041334736209229, + "flos": 29060656549920.0, + "grad_norm": 2.3110837508851834, + "language_loss": 0.71483368, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.74178612, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.8339974880218506 + }, + { + "auxiliary_loss_clip": 0.01454016, + "auxiliary_loss_mlp": 0.01230687, + "balance_loss_clip": 1.14131558, + "balance_loss_mlp": 1.02106977, + "epoch": 0.5041935968735909, + "flos": 24610690730880.0, + "grad_norm": 1.7914259649387212, + "language_loss": 0.83342528, + "learning_rate": 2.069894137075919e-06, + "loss": 0.86027229, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 2.835602283477783 + }, + { + "auxiliary_loss_clip": 0.01452946, + "auxiliary_loss_mlp": 0.01242654, + "balance_loss_clip": 1.14218736, + "balance_loss_mlp": 1.03303683, + "epoch": 0.5042537201262588, + "flos": 26289369923040.0, + "grad_norm": 1.5525171639853772, + "language_loss": 0.66432297, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.69127893, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.7961363792419434 + }, + { + "auxiliary_loss_clip": 0.01460087, + "auxiliary_loss_mlp": 0.01240048, + "balance_loss_clip": 1.14755321, + "balance_loss_mlp": 1.03043032, + "epoch": 0.5043138433789268, + "flos": 22019526757440.0, + "grad_norm": 2.3938832900810945, + "language_loss": 0.80314726, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.83014858, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.7988200187683105 + }, + { + "auxiliary_loss_clip": 0.01459916, + "auxiliary_loss_mlp": 0.01245718, + "balance_loss_clip": 1.14837229, + "balance_loss_mlp": 1.03686309, + "epoch": 0.5043739666315947, + "flos": 28769630863680.0, + "grad_norm": 2.3286179182377875, + "language_loss": 0.70046335, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72751963, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.8236708641052246 + }, + { + "auxiliary_loss_clip": 0.01453802, + "auxiliary_loss_mlp": 0.01252203, + "balance_loss_clip": 1.1420989, + "balance_loss_mlp": 1.04182291, + "epoch": 0.5044340898842627, + "flos": 27601697374560.0, + "grad_norm": 1.77126656558066, + "language_loss": 0.69398642, + "learning_rate": 2.068337220892191e-06, + "loss": 0.72104651, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 2.83708119392395 + }, + { + "auxiliary_loss_clip": 0.01594736, + "auxiliary_loss_mlp": 0.01199776, + "balance_loss_clip": 1.30688548, + "balance_loss_mlp": 1.00217438, + "epoch": 0.5044942131369307, + "flos": 67463643168000.0, + "grad_norm": 0.8166699358243167, + "language_loss": 0.52903426, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55697942, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 3.181643486022949 + }, + { + "auxiliary_loss_clip": 0.01593572, + "auxiliary_loss_mlp": 0.01197586, + "balance_loss_clip": 1.30578136, + "balance_loss_mlp": 1.00151062, + "epoch": 0.5045543363895987, + "flos": 58636079187840.0, + "grad_norm": 0.8554922511759459, + "language_loss": 0.60650492, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.63441646, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 3.1067800521850586 + }, + { + "auxiliary_loss_clip": 0.01457065, + "auxiliary_loss_mlp": 0.01246878, + "balance_loss_clip": 1.14643955, + "balance_loss_mlp": 1.03764176, + "epoch": 0.5046144596422667, + "flos": 22528717405920.0, + "grad_norm": 1.6564427755636781, + "language_loss": 0.84863734, + "learning_rate": 2.067169506493517e-06, + "loss": 0.87567675, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 2.815234661102295 + }, + { + "auxiliary_loss_clip": 0.01453835, + "auxiliary_loss_mlp": 0.01234811, + "balance_loss_clip": 1.14156485, + "balance_loss_mlp": 1.02557492, + "epoch": 0.5046745828949346, + "flos": 27456734489760.0, + "grad_norm": 2.2082116427133167, + "language_loss": 0.50776672, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53465319, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 4.331147193908691 + }, + { + "auxiliary_loss_clip": 0.01450359, + "auxiliary_loss_mlp": 0.01246946, + "balance_loss_clip": 1.13820267, + "balance_loss_mlp": 1.03866339, + "epoch": 0.5047347061476026, + "flos": 17276449203360.0, + "grad_norm": 3.018236095193752, + "language_loss": 0.75216961, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.77914262, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 2.816155433654785 + }, + { + "auxiliary_loss_clip": 0.01454551, + "auxiliary_loss_mlp": 0.01245321, + "balance_loss_clip": 1.14448094, + "balance_loss_mlp": 1.03627586, + "epoch": 0.5047948294002705, + "flos": 16651121436000.0, + "grad_norm": 2.1201781942283926, + "language_loss": 0.67895848, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.70595717, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 2.7589609622955322 + }, + { + "auxiliary_loss_clip": 0.01462619, + "auxiliary_loss_mlp": 0.0124943, + "balance_loss_clip": 1.15092766, + "balance_loss_mlp": 1.04171968, + "epoch": 0.5048549526529386, + "flos": 26867893976640.0, + "grad_norm": 1.8844575507389956, + "language_loss": 0.78491485, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81203532, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 2.8213791847229004 + }, + { + "auxiliary_loss_clip": 0.01454235, + "auxiliary_loss_mlp": 0.01234969, + "balance_loss_clip": 1.14388108, + "balance_loss_mlp": 1.0255425, + "epoch": 0.5049150759056065, + "flos": 21836028497760.0, + "grad_norm": 2.0617377225978815, + "language_loss": 0.66322696, + "learning_rate": 2.065223265084376e-06, + "loss": 0.69011903, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 2.8382601737976074 + }, + { + "auxiliary_loss_clip": 0.01452878, + "auxiliary_loss_mlp": 0.01242492, + "balance_loss_clip": 1.14211071, + "balance_loss_mlp": 1.03153956, + "epoch": 0.5049751991582745, + "flos": 21687765863040.0, + "grad_norm": 1.7289203554500427, + "language_loss": 0.71832073, + "learning_rate": 2.064834009323688e-06, + "loss": 0.74527442, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.7339303493499756 + }, + { + "auxiliary_loss_clip": 0.01456108, + "auxiliary_loss_mlp": 0.01246088, + "balance_loss_clip": 1.1434021, + "balance_loss_mlp": 1.03418159, + "epoch": 0.5050353224109424, + "flos": 21361428695520.0, + "grad_norm": 1.8160951974349002, + "language_loss": 0.81785214, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.84487408, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 4.149063587188721 + }, + { + "auxiliary_loss_clip": 0.01458788, + "auxiliary_loss_mlp": 0.01237955, + "balance_loss_clip": 1.14582086, + "balance_loss_mlp": 1.02776539, + "epoch": 0.5050954456636104, + "flos": 22822284278880.0, + "grad_norm": 2.019920334624618, + "language_loss": 0.7922014, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81916881, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 4.224117279052734 + }, + { + "auxiliary_loss_clip": 0.0145095, + "auxiliary_loss_mlp": 0.01235111, + "balance_loss_clip": 1.13907683, + "balance_loss_mlp": 1.02492106, + "epoch": 0.5051555689162783, + "flos": 30451002955200.0, + "grad_norm": 1.753945861578984, + "language_loss": 0.702079, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72893953, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.8807528018951416 + }, + { + "auxiliary_loss_clip": 0.01451061, + "auxiliary_loss_mlp": 0.01233313, + "balance_loss_clip": 1.13884437, + "balance_loss_mlp": 1.02178836, + "epoch": 0.5052156921689464, + "flos": 21290085097920.0, + "grad_norm": 1.6731389189276131, + "language_loss": 0.6942786, + "learning_rate": 2.063276961843422e-06, + "loss": 0.72112226, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 2.7776753902435303 + }, + { + "auxiliary_loss_clip": 0.0145486, + "auxiliary_loss_mlp": 0.01236216, + "balance_loss_clip": 1.14413905, + "balance_loss_mlp": 1.02812469, + "epoch": 0.5052758154216143, + "flos": 25083583765920.0, + "grad_norm": 1.48118617352294, + "language_loss": 0.86010313, + "learning_rate": 2.062887693937781e-06, + "loss": 0.88701391, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 2.8877437114715576 + }, + { + "auxiliary_loss_clip": 0.01452107, + "auxiliary_loss_mlp": 0.01236902, + "balance_loss_clip": 1.13991475, + "balance_loss_mlp": 1.02842855, + "epoch": 0.5053359386742823, + "flos": 20887549528320.0, + "grad_norm": 1.6515173828292888, + "language_loss": 0.7591399, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.78602993, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 2.8019416332244873 + }, + { + "auxiliary_loss_clip": 0.01449579, + "auxiliary_loss_mlp": 0.0124131, + "balance_loss_clip": 1.13762426, + "balance_loss_mlp": 1.0297848, + "epoch": 0.5053960619269503, + "flos": 37746405682560.0, + "grad_norm": 1.6874061164551402, + "language_loss": 0.73148179, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75839067, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 3.0982534885406494 + }, + { + "auxiliary_loss_clip": 0.0145112, + "auxiliary_loss_mlp": 0.01230542, + "balance_loss_clip": 1.13784099, + "balance_loss_mlp": 1.02340388, + "epoch": 0.5054561851796182, + "flos": 23516111031840.0, + "grad_norm": 1.9074612994821232, + "language_loss": 0.77054107, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79735768, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 4.308155059814453 + }, + { + "auxiliary_loss_clip": 0.01448793, + "auxiliary_loss_mlp": 0.01241833, + "balance_loss_clip": 1.13580465, + "balance_loss_mlp": 1.03412282, + "epoch": 0.5055163084322862, + "flos": 30412922718240.0, + "grad_norm": 3.1034797106404395, + "language_loss": 0.63338816, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.66029441, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.879998207092285 + }, + { + "auxiliary_loss_clip": 0.01455307, + "auxiliary_loss_mlp": 0.01238863, + "balance_loss_clip": 1.14204919, + "balance_loss_mlp": 1.02924538, + "epoch": 0.5055764316849541, + "flos": 20261349413280.0, + "grad_norm": 2.0334964288844546, + "language_loss": 0.63642138, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.6633631, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 2.8088762760162354 + }, + { + "auxiliary_loss_clip": 0.01452517, + "auxiliary_loss_mlp": 0.01227484, + "balance_loss_clip": 1.14041972, + "balance_loss_mlp": 1.01882052, + "epoch": 0.5056365549376222, + "flos": 26073480650400.0, + "grad_norm": 1.4066503951880198, + "language_loss": 0.70959216, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73639214, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.8402857780456543 + }, + { + "auxiliary_loss_clip": 0.0145108, + "auxiliary_loss_mlp": 0.0123764, + "balance_loss_clip": 1.1371336, + "balance_loss_mlp": 1.02878571, + "epoch": 0.5056966781902901, + "flos": 19280972496960.0, + "grad_norm": 1.5653974266454203, + "language_loss": 0.79353821, + "learning_rate": 2.060162752653113e-06, + "loss": 0.82042545, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.815978527069092 + }, + { + "auxiliary_loss_clip": 0.01455605, + "auxiliary_loss_mlp": 0.01238647, + "balance_loss_clip": 1.14244676, + "balance_loss_mlp": 1.02788448, + "epoch": 0.5057568014429581, + "flos": 21325396579200.0, + "grad_norm": 2.1008522729592016, + "language_loss": 0.81731141, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.8442539, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.768317937850952 + }, + { + "auxiliary_loss_clip": 0.01455894, + "auxiliary_loss_mlp": 0.01242918, + "balance_loss_clip": 1.14264715, + "balance_loss_mlp": 1.03329992, + "epoch": 0.505816924695626, + "flos": 17495524441440.0, + "grad_norm": 2.0227667637726348, + "language_loss": 0.81049919, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.83748734, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.729931354522705 + }, + { + "auxiliary_loss_clip": 0.01462037, + "auxiliary_loss_mlp": 0.01245581, + "balance_loss_clip": 1.14839518, + "balance_loss_mlp": 1.03539085, + "epoch": 0.505877047948294, + "flos": 21144211937280.0, + "grad_norm": 2.632562975866954, + "language_loss": 0.80480343, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.83187962, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.844374179840088 + }, + { + "auxiliary_loss_clip": 0.01455021, + "auxiliary_loss_mlp": 0.01243743, + "balance_loss_clip": 1.14077353, + "balance_loss_mlp": 1.03774905, + "epoch": 0.5059371712009619, + "flos": 36352645742880.0, + "grad_norm": 2.079792529405936, + "language_loss": 0.62045729, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64744496, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 2.8548641204833984 + }, + { + "auxiliary_loss_clip": 0.01458091, + "auxiliary_loss_mlp": 0.01234721, + "balance_loss_clip": 1.14476323, + "balance_loss_mlp": 1.02319574, + "epoch": 0.50599729445363, + "flos": 22675538770560.0, + "grad_norm": 1.6063173454513267, + "language_loss": 0.81614965, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84307778, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.7957308292388916 + }, + { + "auxiliary_loss_clip": 0.0146213, + "auxiliary_loss_mlp": 0.01234443, + "balance_loss_clip": 1.15028703, + "balance_loss_mlp": 1.0246346, + "epoch": 0.5060574177062979, + "flos": 22750106261760.0, + "grad_norm": 2.0554269910450875, + "language_loss": 0.7928797, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81984544, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.7792282104492188 + }, + { + "auxiliary_loss_clip": 0.01457275, + "auxiliary_loss_mlp": 0.01239299, + "balance_loss_clip": 1.14333081, + "balance_loss_mlp": 1.03082585, + "epoch": 0.5061175409589659, + "flos": 21655374850080.0, + "grad_norm": 3.0487638661543772, + "language_loss": 0.62494278, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65190858, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 2.823110580444336 + }, + { + "auxiliary_loss_clip": 0.01455238, + "auxiliary_loss_mlp": 0.01244186, + "balance_loss_clip": 1.14165628, + "balance_loss_mlp": 1.03399622, + "epoch": 0.5061776642116339, + "flos": 21618394529760.0, + "grad_norm": 2.058132358636353, + "language_loss": 0.7776314, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.80462563, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 2.765845537185669 + }, + { + "auxiliary_loss_clip": 0.01456158, + "auxiliary_loss_mlp": 0.01239612, + "balance_loss_clip": 1.14274907, + "balance_loss_mlp": 1.02923179, + "epoch": 0.5062377874643018, + "flos": 24428633741280.0, + "grad_norm": 2.3986427984352336, + "language_loss": 0.77101189, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79796958, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 2.823617935180664 + }, + { + "auxiliary_loss_clip": 0.01462405, + "auxiliary_loss_mlp": 0.01234047, + "balance_loss_clip": 1.14741945, + "balance_loss_mlp": 1.02366638, + "epoch": 0.5062979107169698, + "flos": 22526403788160.0, + "grad_norm": 1.9637081905539826, + "language_loss": 0.7734102, + "learning_rate": 2.056269786726999e-06, + "loss": 0.80037475, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 2.7759530544281006 + }, + { + "auxiliary_loss_clip": 0.01452264, + "auxiliary_loss_mlp": 0.01235626, + "balance_loss_clip": 1.1402514, + "balance_loss_mlp": 1.02753448, + "epoch": 0.5063580339696377, + "flos": 24574317261120.0, + "grad_norm": 1.4228750069283873, + "language_loss": 0.66665912, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.69353801, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.7960281372070312 + }, + { + "auxiliary_loss_clip": 0.01455801, + "auxiliary_loss_mlp": 0.01246383, + "balance_loss_clip": 1.14496732, + "balance_loss_mlp": 1.03676534, + "epoch": 0.5064181572223058, + "flos": 22598050811040.0, + "grad_norm": 2.521911003405738, + "language_loss": 0.81803077, + "learning_rate": 2.05549116746431e-06, + "loss": 0.8450526, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.8572099208831787 + }, + { + "auxiliary_loss_clip": 0.01459227, + "auxiliary_loss_mlp": 0.01240739, + "balance_loss_clip": 1.14746141, + "balance_loss_mlp": 1.03264689, + "epoch": 0.5064782804749737, + "flos": 25997358104640.0, + "grad_norm": 4.10413576485368, + "language_loss": 0.74909431, + "learning_rate": 2.055101854669237e-06, + "loss": 0.77609396, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 2.938655138015747 + }, + { + "auxiliary_loss_clip": 0.0145577, + "auxiliary_loss_mlp": 0.01234395, + "balance_loss_clip": 1.1439724, + "balance_loss_mlp": 1.02668476, + "epoch": 0.5065384037276417, + "flos": 28555827639840.0, + "grad_norm": 1.7704276164138455, + "language_loss": 0.71576995, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.74267161, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.909991502761841 + }, + { + "auxiliary_loss_clip": 0.01456393, + "auxiliary_loss_mlp": 0.01246551, + "balance_loss_clip": 1.14451575, + "balance_loss_mlp": 1.03617096, + "epoch": 0.5065985269803096, + "flos": 22968233295840.0, + "grad_norm": 1.7740837124769362, + "language_loss": 0.78388327, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.81091273, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.808183431625366 + }, + { + "auxiliary_loss_clip": 0.01453925, + "auxiliary_loss_mlp": 0.01240841, + "balance_loss_clip": 1.1414938, + "balance_loss_mlp": 1.0310322, + "epoch": 0.5066586502329776, + "flos": 21610277903520.0, + "grad_norm": 2.7695020292802393, + "language_loss": 0.78135723, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80830491, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.8215370178222656 + }, + { + "auxiliary_loss_clip": 0.01460255, + "auxiliary_loss_mlp": 0.01237627, + "balance_loss_clip": 1.14798474, + "balance_loss_mlp": 1.02724695, + "epoch": 0.5067187734856455, + "flos": 20342023338240.0, + "grad_norm": 2.1682098191707855, + "language_loss": 0.71656412, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.74354291, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 2.7742176055908203 + }, + { + "auxiliary_loss_clip": 0.01457899, + "auxiliary_loss_mlp": 0.01234236, + "balance_loss_clip": 1.14475358, + "balance_loss_mlp": 1.0261445, + "epoch": 0.5067788967383136, + "flos": 28843667360640.0, + "grad_norm": 1.698887519105514, + "language_loss": 0.8287378, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.85565913, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 2.8323354721069336 + }, + { + "auxiliary_loss_clip": 0.01458887, + "auxiliary_loss_mlp": 0.01238298, + "balance_loss_clip": 1.14581096, + "balance_loss_mlp": 1.02848935, + "epoch": 0.5068390199909815, + "flos": 32452795421280.0, + "grad_norm": 2.180024067176753, + "language_loss": 0.73518085, + "learning_rate": 2.052765934536682e-06, + "loss": 0.76215267, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.888859987258911 + }, + { + "auxiliary_loss_clip": 0.01459669, + "auxiliary_loss_mlp": 0.01234406, + "balance_loss_clip": 1.14825678, + "balance_loss_mlp": 1.02631426, + "epoch": 0.5068991432436495, + "flos": 23148697302720.0, + "grad_norm": 1.6543318729207506, + "language_loss": 0.76315022, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.79009092, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.790111780166626 + }, + { + "auxiliary_loss_clip": 0.01458891, + "auxiliary_loss_mlp": 0.01238334, + "balance_loss_clip": 1.14519191, + "balance_loss_mlp": 1.02967036, + "epoch": 0.5069592664963174, + "flos": 19938046498560.0, + "grad_norm": 1.6745236253089533, + "language_loss": 0.722085, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74905729, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 4.113640308380127 + }, + { + "auxiliary_loss_clip": 0.0156157, + "auxiliary_loss_mlp": 0.01220306, + "balance_loss_clip": 1.27344012, + "balance_loss_mlp": 1.02537537, + "epoch": 0.5070193897489854, + "flos": 65800059383520.0, + "grad_norm": 0.7542063475231839, + "language_loss": 0.63586295, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.66368175, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 3.37017560005188 + }, + { + "auxiliary_loss_clip": 0.01463878, + "auxiliary_loss_mlp": 0.01238583, + "balance_loss_clip": 1.15129995, + "balance_loss_mlp": 1.02915645, + "epoch": 0.5070795130016534, + "flos": 17277283622880.0, + "grad_norm": 1.792869167682501, + "language_loss": 0.77484286, + "learning_rate": 2.051208614233681e-06, + "loss": 0.80186743, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 2.6960644721984863 + }, + { + "auxiliary_loss_clip": 0.01463166, + "auxiliary_loss_mlp": 0.01241855, + "balance_loss_clip": 1.15183067, + "balance_loss_mlp": 1.03013921, + "epoch": 0.5071396362543213, + "flos": 21072109776480.0, + "grad_norm": 1.8165745912409554, + "language_loss": 0.70974195, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73679209, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 2.8044650554656982 + }, + { + "auxiliary_loss_clip": 0.01464759, + "auxiliary_loss_mlp": 0.0124176, + "balance_loss_clip": 1.15289998, + "balance_loss_mlp": 1.02870905, + "epoch": 0.5071997595069894, + "flos": 23146838822880.0, + "grad_norm": 1.9174296687466938, + "language_loss": 0.72216356, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74922884, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 2.8774635791778564 + }, + { + "auxiliary_loss_clip": 0.01463722, + "auxiliary_loss_mlp": 0.01239914, + "balance_loss_clip": 1.15246415, + "balance_loss_mlp": 1.03163171, + "epoch": 0.5072598827596573, + "flos": 22749385626720.0, + "grad_norm": 1.558157650157168, + "language_loss": 0.83792645, + "learning_rate": 2.050040603565483e-06, + "loss": 0.86496276, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.8307290077209473 + }, + { + "auxiliary_loss_clip": 0.01461184, + "auxiliary_loss_mlp": 0.01240665, + "balance_loss_clip": 1.14953661, + "balance_loss_mlp": 1.03257298, + "epoch": 0.5073200060123253, + "flos": 22568580266400.0, + "grad_norm": 2.073009466974025, + "language_loss": 0.8058368, + "learning_rate": 2.049651262861309e-06, + "loss": 0.83285522, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 4.259103775024414 + }, + { + "auxiliary_loss_clip": 0.01462425, + "auxiliary_loss_mlp": 0.01241515, + "balance_loss_clip": 1.15111804, + "balance_loss_mlp": 1.02941751, + "epoch": 0.5073801292649932, + "flos": 25808398189920.0, + "grad_norm": 1.7746889226913376, + "language_loss": 0.79550761, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.82254702, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 4.245609998703003 + }, + { + "auxiliary_loss_clip": 0.01460921, + "auxiliary_loss_mlp": 0.01232999, + "balance_loss_clip": 1.14884925, + "balance_loss_mlp": 1.02643275, + "epoch": 0.5074402525176612, + "flos": 25376278291200.0, + "grad_norm": 1.5118236103432918, + "language_loss": 0.7074402, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73437941, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 2.7845027446746826 + }, + { + "auxiliary_loss_clip": 0.01460164, + "auxiliary_loss_mlp": 0.01235144, + "balance_loss_clip": 1.14872253, + "balance_loss_mlp": 1.02590835, + "epoch": 0.5075003757703291, + "flos": 26066653581600.0, + "grad_norm": 1.649284320894158, + "language_loss": 0.71192175, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73887479, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.8528172969818115 + }, + { + "auxiliary_loss_clip": 0.01458153, + "auxiliary_loss_mlp": 0.01238561, + "balance_loss_clip": 1.14658773, + "balance_loss_mlp": 1.02856183, + "epoch": 0.5075604990229972, + "flos": 21837621480480.0, + "grad_norm": 1.7437556467502504, + "language_loss": 0.63788819, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.66485536, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 2.763430118560791 + }, + { + "auxiliary_loss_clip": 0.01459238, + "auxiliary_loss_mlp": 0.01240876, + "balance_loss_clip": 1.14722979, + "balance_loss_mlp": 1.03564572, + "epoch": 0.5076206222756651, + "flos": 31981457440800.0, + "grad_norm": 1.4889366091671954, + "language_loss": 0.71364921, + "learning_rate": 2.047704531394006e-06, + "loss": 0.74065042, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.8329458236694336 + }, + { + "auxiliary_loss_clip": 0.01456165, + "auxiliary_loss_mlp": 0.01238495, + "balance_loss_clip": 1.1450634, + "balance_loss_mlp": 1.03097498, + "epoch": 0.5076807455283331, + "flos": 36907047122400.0, + "grad_norm": 1.4037189704647266, + "language_loss": 0.61779594, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64474255, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 2.9173481464385986 + }, + { + "auxiliary_loss_clip": 0.01457957, + "auxiliary_loss_mlp": 0.01235166, + "balance_loss_clip": 1.1447897, + "balance_loss_mlp": 1.02802813, + "epoch": 0.507740868781001, + "flos": 29865044982240.0, + "grad_norm": 1.6416795757278273, + "language_loss": 0.6365298, + "learning_rate": 2.046925826041012e-06, + "loss": 0.66346103, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 4.2832677364349365 + }, + { + "auxiliary_loss_clip": 0.01557847, + "auxiliary_loss_mlp": 0.01204498, + "balance_loss_clip": 1.27149796, + "balance_loss_mlp": 1.00765991, + "epoch": 0.507800992033669, + "flos": 61924635161280.0, + "grad_norm": 0.8450619511007995, + "language_loss": 0.61850148, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.6461249, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.3514404296875 + }, + { + "auxiliary_loss_clip": 0.01453331, + "auxiliary_loss_mlp": 0.01239062, + "balance_loss_clip": 1.14234245, + "balance_loss_mlp": 1.03249609, + "epoch": 0.507861115286337, + "flos": 20702382429600.0, + "grad_norm": 1.6998387033262483, + "language_loss": 0.80571151, + "learning_rate": 2.04614711357029e-06, + "loss": 0.8326354, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.7601449489593506 + }, + { + "auxiliary_loss_clip": 0.0145525, + "auxiliary_loss_mlp": 0.01248506, + "balance_loss_clip": 1.14329898, + "balance_loss_mlp": 1.04327512, + "epoch": 0.507921238539005, + "flos": 30849707780640.0, + "grad_norm": 1.529690952842943, + "language_loss": 0.70449424, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.73153186, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 2.898916721343994 + }, + { + "auxiliary_loss_clip": 0.01457907, + "auxiliary_loss_mlp": 0.0123321, + "balance_loss_clip": 1.14802337, + "balance_loss_mlp": 1.02778864, + "epoch": 0.507981361791673, + "flos": 35703157373280.0, + "grad_norm": 2.0049944877024335, + "language_loss": 0.71905994, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74597108, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 3.0587961673736572 + }, + { + "auxiliary_loss_clip": 0.01454767, + "auxiliary_loss_mlp": 0.01234207, + "balance_loss_clip": 1.14375937, + "balance_loss_mlp": 1.02687883, + "epoch": 0.5080414850443409, + "flos": 27163888251840.0, + "grad_norm": 1.648800140490048, + "language_loss": 0.7290616, + "learning_rate": 2.044979031776844e-06, + "loss": 0.75595129, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.8604133129119873 + }, + { + "auxiliary_loss_clip": 0.01456302, + "auxiliary_loss_mlp": 0.01238711, + "balance_loss_clip": 1.14494514, + "balance_loss_mlp": 1.02871215, + "epoch": 0.5081016082970089, + "flos": 27087424352640.0, + "grad_norm": 9.555205035046942, + "language_loss": 0.77089685, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79784697, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.876329183578491 + }, + { + "auxiliary_loss_clip": 0.0145584, + "auxiliary_loss_mlp": 0.01240834, + "balance_loss_clip": 1.14501357, + "balance_loss_mlp": 1.03464937, + "epoch": 0.5081617315496768, + "flos": 22858733604960.0, + "grad_norm": 2.0521877818675467, + "language_loss": 0.85123134, + "learning_rate": 2.044200302028559e-06, + "loss": 0.87819803, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 2.806689977645874 + }, + { + "auxiliary_loss_clip": 0.01455895, + "auxiliary_loss_mlp": 0.0124285, + "balance_loss_clip": 1.14496732, + "balance_loss_mlp": 1.03113413, + "epoch": 0.5082218548023448, + "flos": 16283328425280.0, + "grad_norm": 2.557641808535434, + "language_loss": 0.77533406, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80232155, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.6943092346191406 + }, + { + "auxiliary_loss_clip": 0.01457061, + "auxiliary_loss_mlp": 0.01241471, + "balance_loss_clip": 1.14715958, + "balance_loss_mlp": 1.03318858, + "epoch": 0.5082819780550127, + "flos": 24462997018560.0, + "grad_norm": 1.9558547932973593, + "language_loss": 0.76658189, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.7935673, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 2.8159613609313965 + }, + { + "auxiliary_loss_clip": 0.01453416, + "auxiliary_loss_mlp": 0.01247293, + "balance_loss_clip": 1.14247453, + "balance_loss_mlp": 1.03805661, + "epoch": 0.5083421013076808, + "flos": 23405473496160.0, + "grad_norm": 1.7517457848884612, + "language_loss": 0.89285898, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91986609, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.8131186962127686 + }, + { + "auxiliary_loss_clip": 0.0145197, + "auxiliary_loss_mlp": 0.01242383, + "balance_loss_clip": 1.13959372, + "balance_loss_mlp": 1.03066707, + "epoch": 0.5084022245603487, + "flos": 23874649571520.0, + "grad_norm": 1.799565503022172, + "language_loss": 0.62366694, + "learning_rate": 2.042642822537149e-06, + "loss": 0.65061045, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.7266435623168945 + }, + { + "auxiliary_loss_clip": 0.01532819, + "auxiliary_loss_mlp": 0.01203598, + "balance_loss_clip": 1.24847603, + "balance_loss_mlp": 1.00752258, + "epoch": 0.5084623478130167, + "flos": 62879372277120.0, + "grad_norm": 0.8067537023636001, + "language_loss": 0.62330234, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.65066648, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 3.198669195175171 + }, + { + "auxiliary_loss_clip": 0.01447544, + "auxiliary_loss_mlp": 0.01240208, + "balance_loss_clip": 1.13661397, + "balance_loss_mlp": 1.03173482, + "epoch": 0.5085224710656846, + "flos": 22348329255360.0, + "grad_norm": 5.702377411815687, + "language_loss": 0.67571062, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.70258808, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 2.824495315551758 + }, + { + "auxiliary_loss_clip": 0.01450049, + "auxiliary_loss_mlp": 0.01233749, + "balance_loss_clip": 1.13761139, + "balance_loss_mlp": 1.02489471, + "epoch": 0.5085825943183526, + "flos": 26068587917760.0, + "grad_norm": 1.639334781877702, + "language_loss": 0.77424169, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80107963, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 2.8800294399261475 + }, + { + "auxiliary_loss_clip": 0.01454948, + "auxiliary_loss_mlp": 0.01253169, + "balance_loss_clip": 1.14331937, + "balance_loss_mlp": 1.04297948, + "epoch": 0.5086427175710206, + "flos": 17422663717440.0, + "grad_norm": 2.063160913110453, + "language_loss": 0.80707574, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83415687, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 2.755476951599121 + }, + { + "auxiliary_loss_clip": 0.01450048, + "auxiliary_loss_mlp": 0.01246507, + "balance_loss_clip": 1.13881946, + "balance_loss_mlp": 1.03879702, + "epoch": 0.5087028408236886, + "flos": 20633883444000.0, + "grad_norm": 1.9514529115221042, + "language_loss": 0.68985742, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.71682298, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.8303725719451904 + }, + { + "auxiliary_loss_clip": 0.01454496, + "auxiliary_loss_mlp": 0.0124087, + "balance_loss_clip": 1.14472246, + "balance_loss_mlp": 1.03125238, + "epoch": 0.5087629640763566, + "flos": 25596225876960.0, + "grad_norm": 1.7308560080151518, + "language_loss": 0.76329982, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.7902534, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.830819606781006 + }, + { + "auxiliary_loss_clip": 0.01451596, + "auxiliary_loss_mlp": 0.01236572, + "balance_loss_clip": 1.14022899, + "balance_loss_mlp": 1.02771688, + "epoch": 0.5088230873290245, + "flos": 13263344303040.0, + "grad_norm": 2.0961857515290023, + "language_loss": 0.81351542, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.84039712, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.8062241077423096 + }, + { + "auxiliary_loss_clip": 0.01444993, + "auxiliary_loss_mlp": 0.01238107, + "balance_loss_clip": 1.13354254, + "balance_loss_mlp": 1.02887034, + "epoch": 0.5088832105816925, + "flos": 20045308428000.0, + "grad_norm": 1.7655293790044055, + "language_loss": 0.76336837, + "learning_rate": 2.039527786882341e-06, + "loss": 0.7901994, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 2.7713801860809326 + }, + { + "auxiliary_loss_clip": 0.0153561, + "auxiliary_loss_mlp": 0.01221596, + "balance_loss_clip": 1.25094199, + "balance_loss_mlp": 1.02552032, + "epoch": 0.5089433338343604, + "flos": 67429848448800.0, + "grad_norm": 0.6839309947761141, + "language_loss": 0.59314358, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.62071562, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.4772493839263916 + }, + { + "auxiliary_loss_clip": 0.01445725, + "auxiliary_loss_mlp": 0.01255389, + "balance_loss_clip": 1.1353935, + "balance_loss_mlp": 1.04920435, + "epoch": 0.5090034570870284, + "flos": 22712519090880.0, + "grad_norm": 2.9576894350786405, + "language_loss": 0.79898632, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82599747, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 2.852165699005127 + }, + { + "auxiliary_loss_clip": 0.01450163, + "auxiliary_loss_mlp": 0.01228378, + "balance_loss_clip": 1.13758695, + "balance_loss_mlp": 1.0189507, + "epoch": 0.5090635803396963, + "flos": 20447730213120.0, + "grad_norm": 1.5836048761375348, + "language_loss": 0.7846868, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.81147218, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.7898688316345215 + }, + { + "auxiliary_loss_clip": 0.01451479, + "auxiliary_loss_mlp": 0.012342, + "balance_loss_clip": 1.14034271, + "balance_loss_mlp": 1.026299, + "epoch": 0.5091237035923644, + "flos": 23771142529920.0, + "grad_norm": 1.6634494348252793, + "language_loss": 0.74108863, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76794541, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.8206725120544434 + }, + { + "auxiliary_loss_clip": 0.01450801, + "auxiliary_loss_mlp": 0.01240687, + "balance_loss_clip": 1.13957787, + "balance_loss_mlp": 1.03106928, + "epoch": 0.5091838268450323, + "flos": 18329610987360.0, + "grad_norm": 1.9740886914298061, + "language_loss": 0.7782135, + "learning_rate": 2.03758084040404e-06, + "loss": 0.80512834, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.7214443683624268 + }, + { + "auxiliary_loss_clip": 0.01466719, + "auxiliary_loss_mlp": 0.01243893, + "balance_loss_clip": 1.1544379, + "balance_loss_mlp": 1.03351212, + "epoch": 0.5092439500977003, + "flos": 29060087627520.0, + "grad_norm": 1.5041156892359977, + "language_loss": 0.69841444, + "learning_rate": 2.037191446774109e-06, + "loss": 0.72552055, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 4.336380481719971 + }, + { + "auxiliary_loss_clip": 0.0144818, + "auxiliary_loss_mlp": 0.01239046, + "balance_loss_clip": 1.13575721, + "balance_loss_mlp": 1.02942812, + "epoch": 0.5093040733503682, + "flos": 13555659546720.0, + "grad_norm": 1.7829469150271497, + "language_loss": 0.73606277, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.76293504, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 2.7236571311950684 + }, + { + "auxiliary_loss_clip": 0.01545774, + "auxiliary_loss_mlp": 0.01208961, + "balance_loss_clip": 1.26245904, + "balance_loss_mlp": 1.01136017, + "epoch": 0.5093641966030362, + "flos": 68913726789600.0, + "grad_norm": 0.7462508033598524, + "language_loss": 0.58052266, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60807002, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.330982208251953 + }, + { + "auxiliary_loss_clip": 0.01450334, + "auxiliary_loss_mlp": 0.01237505, + "balance_loss_clip": 1.13976479, + "balance_loss_mlp": 1.03151166, + "epoch": 0.5094243198557042, + "flos": 21583158904800.0, + "grad_norm": 1.8979924970903783, + "language_loss": 0.69105136, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71792972, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 2.8473031520843506 + }, + { + "auxiliary_loss_clip": 0.01459178, + "auxiliary_loss_mlp": 0.01237312, + "balance_loss_clip": 1.14867485, + "balance_loss_mlp": 1.02559626, + "epoch": 0.5094844431083722, + "flos": 28843288079040.0, + "grad_norm": 1.8638809957445082, + "language_loss": 0.85318267, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.88014758, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 2.916679859161377 + }, + { + "auxiliary_loss_clip": 0.01453054, + "auxiliary_loss_mlp": 0.01239271, + "balance_loss_clip": 1.14175594, + "balance_loss_mlp": 1.03194201, + "epoch": 0.5095445663610402, + "flos": 14977903898880.0, + "grad_norm": 2.782404395823208, + "language_loss": 0.65536571, + "learning_rate": 2.035244457765222e-06, + "loss": 0.68228889, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.7910866737365723 + }, + { + "auxiliary_loss_clip": 0.01461845, + "auxiliary_loss_mlp": 0.0124557, + "balance_loss_clip": 1.15147007, + "balance_loss_mlp": 1.03461707, + "epoch": 0.5096046896137081, + "flos": 20779187682240.0, + "grad_norm": 7.573852599182042, + "language_loss": 0.81913894, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.8462131, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 4.126841068267822 + }, + { + "auxiliary_loss_clip": 0.01464379, + "auxiliary_loss_mlp": 0.01256509, + "balance_loss_clip": 1.15190113, + "balance_loss_mlp": 1.04650998, + "epoch": 0.5096648128663761, + "flos": 23187308533920.0, + "grad_norm": 2.3447966119623747, + "language_loss": 0.80739379, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83460265, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 4.373257875442505 + }, + { + "auxiliary_loss_clip": 0.01461916, + "auxiliary_loss_mlp": 0.0124404, + "balance_loss_clip": 1.15148449, + "balance_loss_mlp": 1.03289676, + "epoch": 0.509724936119044, + "flos": 22311614432160.0, + "grad_norm": 8.210852321646463, + "language_loss": 0.61422455, + "learning_rate": 2.034076248204082e-06, + "loss": 0.64128417, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.8204920291900635 + }, + { + "auxiliary_loss_clip": 0.01460764, + "auxiliary_loss_mlp": 0.0123578, + "balance_loss_clip": 1.15070164, + "balance_loss_mlp": 1.02520835, + "epoch": 0.509785059371712, + "flos": 26289521635680.0, + "grad_norm": 2.0664495111179786, + "language_loss": 0.663077, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.69004238, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.8236141204833984 + }, + { + "auxiliary_loss_clip": 0.01467744, + "auxiliary_loss_mlp": 0.01241032, + "balance_loss_clip": 1.15857279, + "balance_loss_mlp": 1.03217697, + "epoch": 0.50984518262438, + "flos": 22966716169440.0, + "grad_norm": 1.634427972604306, + "language_loss": 0.69442284, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.72151065, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.7879178524017334 + }, + { + "auxiliary_loss_clip": 0.01466884, + "auxiliary_loss_mlp": 0.01241828, + "balance_loss_clip": 1.15631557, + "balance_loss_mlp": 1.03335452, + "epoch": 0.509905305877048, + "flos": 26215788564000.0, + "grad_norm": 1.7950291131133496, + "language_loss": 0.7941342, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.82122123, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.787713050842285 + }, + { + "auxiliary_loss_clip": 0.01462237, + "auxiliary_loss_mlp": 0.0123461, + "balance_loss_clip": 1.15279257, + "balance_loss_mlp": 1.02747154, + "epoch": 0.5099654291297159, + "flos": 20342175050880.0, + "grad_norm": 1.6000283838184814, + "language_loss": 0.83273596, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85970443, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 2.7501115798950195 + }, + { + "auxiliary_loss_clip": 0.01468704, + "auxiliary_loss_mlp": 0.01239177, + "balance_loss_clip": 1.15939426, + "balance_loss_mlp": 1.03127635, + "epoch": 0.5100255523823839, + "flos": 29057129231040.0, + "grad_norm": 1.7747892629858986, + "language_loss": 0.85518068, + "learning_rate": 2.032129206622238e-06, + "loss": 0.88225949, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.7852942943573 + }, + { + "auxiliary_loss_clip": 0.01465226, + "auxiliary_loss_mlp": 0.0123321, + "balance_loss_clip": 1.15446782, + "balance_loss_mlp": 1.02340126, + "epoch": 0.5100856756350518, + "flos": 22458132371520.0, + "grad_norm": 1.8891551966341666, + "language_loss": 0.826536, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85352039, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 4.312335968017578 + }, + { + "auxiliary_loss_clip": 0.01471872, + "auxiliary_loss_mlp": 0.01257798, + "balance_loss_clip": 1.16181326, + "balance_loss_mlp": 1.04779851, + "epoch": 0.5101457988877198, + "flos": 19173027860640.0, + "grad_norm": 1.8610882829721047, + "language_loss": 0.81646514, + "learning_rate": 2.031350381357736e-06, + "loss": 0.8437618, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 2.814391613006592 + }, + { + "auxiliary_loss_clip": 0.01466737, + "auxiliary_loss_mlp": 0.01237282, + "balance_loss_clip": 1.15720618, + "balance_loss_mlp": 1.03090668, + "epoch": 0.5102059221403878, + "flos": 14868138710880.0, + "grad_norm": 2.561004869877632, + "language_loss": 0.73830783, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76534802, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.7335257530212402 + }, + { + "auxiliary_loss_clip": 0.01472666, + "auxiliary_loss_mlp": 0.01249102, + "balance_loss_clip": 1.16096234, + "balance_loss_mlp": 1.04005623, + "epoch": 0.5102660453930558, + "flos": 22963037137920.0, + "grad_norm": 2.7789965770977783, + "language_loss": 0.70043063, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72764832, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 2.8303308486938477 + }, + { + "auxiliary_loss_clip": 0.01473666, + "auxiliary_loss_mlp": 0.01240733, + "balance_loss_clip": 1.16367614, + "balance_loss_mlp": 1.03092432, + "epoch": 0.5103261686457238, + "flos": 23151579842880.0, + "grad_norm": 2.725698111034015, + "language_loss": 0.73218852, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75933254, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 2.8036155700683594 + }, + { + "auxiliary_loss_clip": 0.01472501, + "auxiliary_loss_mlp": 0.01248082, + "balance_loss_clip": 1.16368747, + "balance_loss_mlp": 1.03636599, + "epoch": 0.5103862918983917, + "flos": 14320981609920.0, + "grad_norm": 2.3034777411339387, + "language_loss": 0.69496322, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.72216904, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.782684803009033 + }, + { + "auxiliary_loss_clip": 0.0146645, + "auxiliary_loss_mlp": 0.01232161, + "balance_loss_clip": 1.15648508, + "balance_loss_mlp": 1.02406883, + "epoch": 0.5104464151510597, + "flos": 25850916021600.0, + "grad_norm": 1.7646577825958558, + "language_loss": 0.72177351, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.74875963, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 2.8513364791870117 + }, + { + "auxiliary_loss_clip": 0.01469465, + "auxiliary_loss_mlp": 0.01236309, + "balance_loss_clip": 1.16066957, + "balance_loss_mlp": 1.02726412, + "epoch": 0.5105065384037276, + "flos": 21655071424800.0, + "grad_norm": 1.5612109172151785, + "language_loss": 0.80641502, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.83347273, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.839400053024292 + }, + { + "auxiliary_loss_clip": 0.01467342, + "auxiliary_loss_mlp": 0.01222806, + "balance_loss_clip": 1.15815139, + "balance_loss_mlp": 1.01471412, + "epoch": 0.5105666616563956, + "flos": 22493747278080.0, + "grad_norm": 2.1810128817129355, + "language_loss": 0.79216105, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81906247, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 2.788398265838623 + }, + { + "auxiliary_loss_clip": 0.01477194, + "auxiliary_loss_mlp": 0.01233189, + "balance_loss_clip": 1.16851187, + "balance_loss_mlp": 1.02261734, + "epoch": 0.5106267849090635, + "flos": 22457980658880.0, + "grad_norm": 2.087485753943869, + "language_loss": 0.78054166, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.8076455, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.83892822265625 + }, + { + "auxiliary_loss_clip": 0.01466506, + "auxiliary_loss_mlp": 0.01238562, + "balance_loss_clip": 1.15745604, + "balance_loss_mlp": 1.02875376, + "epoch": 0.5106869081617316, + "flos": 23549184751680.0, + "grad_norm": 5.1512702929529475, + "language_loss": 0.83690852, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.86395919, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.8126823902130127 + }, + { + "auxiliary_loss_clip": 0.01465744, + "auxiliary_loss_mlp": 0.01236316, + "balance_loss_clip": 1.15683174, + "balance_loss_mlp": 1.02784312, + "epoch": 0.5107470314143995, + "flos": 26795184965280.0, + "grad_norm": 3.7758093634115895, + "language_loss": 0.78998476, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81700528, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 2.8206357955932617 + }, + { + "auxiliary_loss_clip": 0.01463302, + "auxiliary_loss_mlp": 0.01238466, + "balance_loss_clip": 1.15364635, + "balance_loss_mlp": 1.02903914, + "epoch": 0.5108071546670675, + "flos": 25742099037600.0, + "grad_norm": 1.8854931798608165, + "language_loss": 0.78253055, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80954826, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.875371217727661 + }, + { + "auxiliary_loss_clip": 0.01461286, + "auxiliary_loss_mlp": 0.01235871, + "balance_loss_clip": 1.15249848, + "balance_loss_mlp": 1.02797019, + "epoch": 0.5108672779197354, + "flos": 18699527975040.0, + "grad_norm": 2.0278510445293882, + "language_loss": 0.78784251, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.81481409, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.7780039310455322 + }, + { + "auxiliary_loss_clip": 0.0146025, + "auxiliary_loss_mlp": 0.01239309, + "balance_loss_clip": 1.15036988, + "balance_loss_mlp": 1.03064537, + "epoch": 0.5109274011724034, + "flos": 26690615935200.0, + "grad_norm": 1.6080185807102947, + "language_loss": 0.81342196, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84041762, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 2.794464588165283 + }, + { + "auxiliary_loss_clip": 0.01466872, + "auxiliary_loss_mlp": 0.01229948, + "balance_loss_clip": 1.15904415, + "balance_loss_mlp": 1.02052116, + "epoch": 0.5109875244250714, + "flos": 22786252162560.0, + "grad_norm": 1.7404189629996092, + "language_loss": 0.70854294, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73551118, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 2.754709005355835 + }, + { + "auxiliary_loss_clip": 0.01463765, + "auxiliary_loss_mlp": 0.01242502, + "balance_loss_clip": 1.15311372, + "balance_loss_mlp": 1.03631747, + "epoch": 0.5110476476777394, + "flos": 35591344064640.0, + "grad_norm": 1.9024522073228696, + "language_loss": 0.72342062, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.75048333, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 2.9156880378723145 + }, + { + "auxiliary_loss_clip": 0.01463675, + "auxiliary_loss_mlp": 0.01255937, + "balance_loss_clip": 1.15256882, + "balance_loss_mlp": 1.04536581, + "epoch": 0.5111077709304074, + "flos": 19282793048640.0, + "grad_norm": 3.858870123593942, + "language_loss": 0.63022894, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.65742505, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 2.818570852279663 + }, + { + "auxiliary_loss_clip": 0.01456023, + "auxiliary_loss_mlp": 0.01249983, + "balance_loss_clip": 1.14514601, + "balance_loss_mlp": 1.04265428, + "epoch": 0.5111678941830753, + "flos": 20670332770080.0, + "grad_norm": 1.8889526498897213, + "language_loss": 0.87568176, + "learning_rate": 2.024730186540907e-06, + "loss": 0.90274191, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 2.769355297088623 + }, + { + "auxiliary_loss_clip": 0.01465188, + "auxiliary_loss_mlp": 0.01252109, + "balance_loss_clip": 1.15538502, + "balance_loss_mlp": 1.04649639, + "epoch": 0.5112280174357433, + "flos": 26290393983360.0, + "grad_norm": 1.378515550651014, + "language_loss": 0.82433265, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.85150564, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 2.9565443992614746 + }, + { + "auxiliary_loss_clip": 0.01577441, + "auxiliary_loss_mlp": 0.01205383, + "balance_loss_clip": 1.29339468, + "balance_loss_mlp": 1.00778198, + "epoch": 0.5112881406884112, + "flos": 59479761558240.0, + "grad_norm": 0.8479781284683803, + "language_loss": 0.63868934, + "learning_rate": 2.023951320871339e-06, + "loss": 0.66651762, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 3.383666753768921 + }, + { + "auxiliary_loss_clip": 0.01463743, + "auxiliary_loss_mlp": 0.01246089, + "balance_loss_clip": 1.15452576, + "balance_loss_mlp": 1.03914154, + "epoch": 0.5113482639410792, + "flos": 26471275200000.0, + "grad_norm": 1.7335322316880262, + "language_loss": 0.83790052, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86499888, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.8468940258026123 + }, + { + "auxiliary_loss_clip": 0.01461028, + "auxiliary_loss_mlp": 0.01248717, + "balance_loss_clip": 1.15277743, + "balance_loss_mlp": 1.04177022, + "epoch": 0.5114083871937471, + "flos": 29898308342880.0, + "grad_norm": 2.423470287626613, + "language_loss": 0.75306857, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.78016603, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.8478236198425293 + }, + { + "auxiliary_loss_clip": 0.01464981, + "auxiliary_loss_mlp": 0.01248977, + "balance_loss_clip": 1.15570176, + "balance_loss_mlp": 1.03859687, + "epoch": 0.5114685104464152, + "flos": 24316782504480.0, + "grad_norm": 1.8266046069235933, + "language_loss": 0.5788486, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60598814, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 2.8475286960601807 + }, + { + "auxiliary_loss_clip": 0.01462547, + "auxiliary_loss_mlp": 0.01240261, + "balance_loss_clip": 1.15460253, + "balance_loss_mlp": 1.03197908, + "epoch": 0.5115286336990831, + "flos": 17021190136320.0, + "grad_norm": 2.4184910558068196, + "language_loss": 0.85396647, + "learning_rate": 2.022393578751503e-06, + "loss": 0.88099456, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 4.315695762634277 + }, + { + "auxiliary_loss_clip": 0.01465742, + "auxiliary_loss_mlp": 0.01263663, + "balance_loss_clip": 1.15645504, + "balance_loss_mlp": 1.05576205, + "epoch": 0.5115887569517511, + "flos": 23661642839040.0, + "grad_norm": 2.021530880044287, + "language_loss": 0.72361505, + "learning_rate": 2.022004141061709e-06, + "loss": 0.75090921, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 2.8224849700927734 + }, + { + "auxiliary_loss_clip": 0.01458724, + "auxiliary_loss_mlp": 0.01240282, + "balance_loss_clip": 1.15138531, + "balance_loss_mlp": 1.03333473, + "epoch": 0.511648880204419, + "flos": 16109046708480.0, + "grad_norm": 2.064046547972595, + "language_loss": 0.76382399, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.79081404, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.7541844844818115 + }, + { + "auxiliary_loss_clip": 0.01464426, + "auxiliary_loss_mlp": 0.01234537, + "balance_loss_clip": 1.15653014, + "balance_loss_mlp": 1.02682662, + "epoch": 0.511709003457087, + "flos": 32637734951040.0, + "grad_norm": 1.6632960142736164, + "language_loss": 0.71027613, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73726577, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 2.8614087104797363 + }, + { + "auxiliary_loss_clip": 0.01458033, + "auxiliary_loss_mlp": 0.01240431, + "balance_loss_clip": 1.14989734, + "balance_loss_mlp": 1.03176689, + "epoch": 0.511769126709755, + "flos": 21764229762240.0, + "grad_norm": 2.101459313038878, + "language_loss": 0.66578764, + "learning_rate": 2.020835823045001e-06, + "loss": 0.69277227, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.7604780197143555 + }, + { + "auxiliary_loss_clip": 0.01455229, + "auxiliary_loss_mlp": 0.01238809, + "balance_loss_clip": 1.14599919, + "balance_loss_mlp": 1.02804685, + "epoch": 0.511829249962423, + "flos": 23917660469280.0, + "grad_norm": 1.970113616277274, + "language_loss": 0.66472226, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.69166267, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 2.8333687782287598 + }, + { + "auxiliary_loss_clip": 0.01464206, + "auxiliary_loss_mlp": 0.0124786, + "balance_loss_clip": 1.1559602, + "balance_loss_mlp": 1.03938639, + "epoch": 0.511889373215091, + "flos": 23728966051680.0, + "grad_norm": 1.942655569708567, + "language_loss": 0.69181693, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71893758, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.853492498397827 + }, + { + "auxiliary_loss_clip": 0.01455878, + "auxiliary_loss_mlp": 0.01233709, + "balance_loss_clip": 1.14880049, + "balance_loss_mlp": 1.0261898, + "epoch": 0.5119494964677589, + "flos": 28114073988480.0, + "grad_norm": 1.577669870820325, + "language_loss": 0.66143382, + "learning_rate": 2.019667497917424e-06, + "loss": 0.6883297, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 4.250241756439209 + }, + { + "auxiliary_loss_clip": 0.01456912, + "auxiliary_loss_mlp": 0.01243538, + "balance_loss_clip": 1.1489532, + "balance_loss_mlp": 1.03582728, + "epoch": 0.5120096197204269, + "flos": 24975563273280.0, + "grad_norm": 2.2745190599782914, + "language_loss": 0.74798918, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77499366, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 4.354687690734863 + }, + { + "auxiliary_loss_clip": 0.01461444, + "auxiliary_loss_mlp": 0.01235662, + "balance_loss_clip": 1.15250063, + "balance_loss_mlp": 1.02642632, + "epoch": 0.5120697429730948, + "flos": 17969972531040.0, + "grad_norm": 2.590875230163849, + "language_loss": 0.78130424, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80827528, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 2.912060022354126 + }, + { + "auxiliary_loss_clip": 0.01466173, + "auxiliary_loss_mlp": 0.01235865, + "balance_loss_clip": 1.15743184, + "balance_loss_mlp": 1.02643776, + "epoch": 0.5121298662257628, + "flos": 23294456678880.0, + "grad_norm": 1.8336143924759392, + "language_loss": 0.73751765, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.76453805, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.78715443611145 + }, + { + "auxiliary_loss_clip": 0.01463501, + "auxiliary_loss_mlp": 0.01237069, + "balance_loss_clip": 1.15564132, + "balance_loss_mlp": 1.02764177, + "epoch": 0.5121899894784308, + "flos": 17312974385760.0, + "grad_norm": 1.8535008471560086, + "language_loss": 0.77984631, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80685198, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.8823087215423584 + }, + { + "auxiliary_loss_clip": 0.01471798, + "auxiliary_loss_mlp": 0.01238454, + "balance_loss_clip": 1.16421747, + "balance_loss_mlp": 1.02711987, + "epoch": 0.5122501127310988, + "flos": 24932059309440.0, + "grad_norm": 1.7235777687940352, + "language_loss": 0.79541397, + "learning_rate": 2.017720274652497e-06, + "loss": 0.82251644, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.9489922523498535 + }, + { + "auxiliary_loss_clip": 0.01463267, + "auxiliary_loss_mlp": 0.01250468, + "balance_loss_clip": 1.15521526, + "balance_loss_mlp": 1.04046941, + "epoch": 0.5123102359837667, + "flos": 18444875758560.0, + "grad_norm": 1.797567216543255, + "language_loss": 0.81579572, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.84293306, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 4.259719610214233 + }, + { + "auxiliary_loss_clip": 0.01465055, + "auxiliary_loss_mlp": 0.01238649, + "balance_loss_clip": 1.15693831, + "balance_loss_mlp": 1.03170204, + "epoch": 0.5123703592364347, + "flos": 26687126544480.0, + "grad_norm": 1.811579605940064, + "language_loss": 0.68381917, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.7108562, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.8396992683410645 + }, + { + "auxiliary_loss_clip": 0.01466168, + "auxiliary_loss_mlp": 0.01252455, + "balance_loss_clip": 1.15794373, + "balance_loss_mlp": 1.03940439, + "epoch": 0.5124304824891026, + "flos": 28806497399520.0, + "grad_norm": 2.0330874351914354, + "language_loss": 0.61652243, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.64370865, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.804155111312866 + }, + { + "auxiliary_loss_clip": 0.01465885, + "auxiliary_loss_mlp": 0.01237223, + "balance_loss_clip": 1.15723753, + "balance_loss_mlp": 1.0289402, + "epoch": 0.5124906057417706, + "flos": 21763964265120.0, + "grad_norm": 2.063806652216657, + "language_loss": 0.77851379, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80554479, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 2.797837018966675 + }, + { + "auxiliary_loss_clip": 0.01463402, + "auxiliary_loss_mlp": 0.01244856, + "balance_loss_clip": 1.15679693, + "balance_loss_mlp": 1.0392437, + "epoch": 0.5125507289944387, + "flos": 18882571096800.0, + "grad_norm": 1.8177599493278047, + "language_loss": 0.7473309, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77441347, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.8226113319396973 + }, + { + "auxiliary_loss_clip": 0.01462178, + "auxiliary_loss_mlp": 0.01235265, + "balance_loss_clip": 1.15429187, + "balance_loss_mlp": 1.02545619, + "epoch": 0.5126108522471066, + "flos": 35630827643520.0, + "grad_norm": 35.47055675342443, + "language_loss": 0.74047077, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76744521, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 2.896777868270874 + }, + { + "auxiliary_loss_clip": 0.01459715, + "auxiliary_loss_mlp": 0.01251633, + "balance_loss_clip": 1.15221214, + "balance_loss_mlp": 1.04335046, + "epoch": 0.5126709754997746, + "flos": 20192850427680.0, + "grad_norm": 1.5769584534523773, + "language_loss": 0.65252131, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67963487, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 2.816918134689331 + }, + { + "auxiliary_loss_clip": 0.01470965, + "auxiliary_loss_mlp": 0.0123928, + "balance_loss_clip": 1.1625067, + "balance_loss_mlp": 1.03309512, + "epoch": 0.5127310987524425, + "flos": 18590559278400.0, + "grad_norm": 1.5805101621734539, + "language_loss": 0.74588764, + "learning_rate": 2.014604683254908e-06, + "loss": 0.77299011, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.7726075649261475 + }, + { + "auxiliary_loss_clip": 0.01458438, + "auxiliary_loss_mlp": 0.01238795, + "balance_loss_clip": 1.14982247, + "balance_loss_mlp": 1.03241968, + "epoch": 0.5127912220051105, + "flos": 22456842814080.0, + "grad_norm": 1.667600350771692, + "language_loss": 0.82784903, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85482144, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.8671491146087646 + }, + { + "auxiliary_loss_clip": 0.01467945, + "auxiliary_loss_mlp": 0.01230173, + "balance_loss_clip": 1.16001046, + "balance_loss_mlp": 1.02284396, + "epoch": 0.5128513452577784, + "flos": 19095426116640.0, + "grad_norm": 1.7528133193730908, + "language_loss": 0.74020213, + "learning_rate": 2.01382577957204e-06, + "loss": 0.7671833, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.8289945125579834 + }, + { + "auxiliary_loss_clip": 0.01565453, + "auxiliary_loss_mlp": 0.01227539, + "balance_loss_clip": 1.28367376, + "balance_loss_mlp": 1.03146362, + "epoch": 0.5129114685104464, + "flos": 67899745159200.0, + "grad_norm": 0.7392674742135379, + "language_loss": 0.60748655, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.63541645, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 3.3866825103759766 + }, + { + "auxiliary_loss_clip": 0.0146182, + "auxiliary_loss_mlp": 0.01238485, + "balance_loss_clip": 1.15282965, + "balance_loss_mlp": 1.02982104, + "epoch": 0.5129715917631144, + "flos": 20451257532000.0, + "grad_norm": 1.9471551421966287, + "language_loss": 0.76874506, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79574811, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.8047478199005127 + }, + { + "auxiliary_loss_clip": 0.01469341, + "auxiliary_loss_mlp": 0.01240281, + "balance_loss_clip": 1.1617955, + "balance_loss_mlp": 1.03199852, + "epoch": 0.5130317150157824, + "flos": 35119057880160.0, + "grad_norm": 1.9984329802266794, + "language_loss": 0.66912127, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69621754, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.8887784481048584 + }, + { + "auxiliary_loss_clip": 0.01463157, + "auxiliary_loss_mlp": 0.01238507, + "balance_loss_clip": 1.15515947, + "balance_loss_mlp": 1.02927077, + "epoch": 0.5130918382684503, + "flos": 19793652536160.0, + "grad_norm": 1.9979120766586234, + "language_loss": 0.81693834, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84395504, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 2.7887558937072754 + }, + { + "auxiliary_loss_clip": 0.01465512, + "auxiliary_loss_mlp": 0.01245274, + "balance_loss_clip": 1.15674162, + "balance_loss_mlp": 1.03546607, + "epoch": 0.5131519615211183, + "flos": 26325515823840.0, + "grad_norm": 1.528063625345028, + "language_loss": 0.64066881, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.6677767, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.826725482940674 + }, + { + "auxiliary_loss_clip": 0.01468297, + "auxiliary_loss_mlp": 0.01236353, + "balance_loss_clip": 1.1604321, + "balance_loss_mlp": 1.03035975, + "epoch": 0.5132120847737862, + "flos": 19173938136480.0, + "grad_norm": 1.7690367089218424, + "language_loss": 0.6977824, + "learning_rate": 2.011489056413418e-06, + "loss": 0.7248289, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.8105454444885254 + }, + { + "auxiliary_loss_clip": 0.01459626, + "auxiliary_loss_mlp": 0.01241411, + "balance_loss_clip": 1.15069556, + "balance_loss_mlp": 1.02893257, + "epoch": 0.5132722080264542, + "flos": 20232523647360.0, + "grad_norm": 2.2923822885224214, + "language_loss": 0.70851016, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73552048, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 2.8826539516448975 + }, + { + "auxiliary_loss_clip": 0.01458282, + "auxiliary_loss_mlp": 0.01231497, + "balance_loss_clip": 1.14954984, + "balance_loss_mlp": 1.02435875, + "epoch": 0.5133323312791223, + "flos": 16471188423360.0, + "grad_norm": 1.9413437593862675, + "language_loss": 0.79973853, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82663631, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 2.760749101638794 + }, + { + "auxiliary_loss_clip": 0.01452315, + "auxiliary_loss_mlp": 0.01242408, + "balance_loss_clip": 1.14299595, + "balance_loss_mlp": 1.03641474, + "epoch": 0.5133924545317902, + "flos": 26070522253920.0, + "grad_norm": 1.9307248260669605, + "language_loss": 0.78287596, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80982322, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.8488235473632812 + }, + { + "auxiliary_loss_clip": 0.01456803, + "auxiliary_loss_mlp": 0.01230938, + "balance_loss_clip": 1.14844728, + "balance_loss_mlp": 1.02265584, + "epoch": 0.5134525777844582, + "flos": 29133517273920.0, + "grad_norm": 2.5357743533057753, + "language_loss": 0.76194549, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78882289, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 2.863112688064575 + }, + { + "auxiliary_loss_clip": 0.0146337, + "auxiliary_loss_mlp": 0.01242858, + "balance_loss_clip": 1.15701914, + "balance_loss_mlp": 1.03343153, + "epoch": 0.5135127010371261, + "flos": 17456571856800.0, + "grad_norm": 1.6939972805530905, + "language_loss": 0.74924493, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.77630723, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 2.8311233520507812 + }, + { + "auxiliary_loss_clip": 0.01459756, + "auxiliary_loss_mlp": 0.01240685, + "balance_loss_clip": 1.15167093, + "balance_loss_mlp": 1.03049517, + "epoch": 0.5135728242897941, + "flos": 21947159099520.0, + "grad_norm": 1.6153531875427802, + "language_loss": 0.70478636, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.73179078, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.8163681030273438 + }, + { + "auxiliary_loss_clip": 0.01456962, + "auxiliary_loss_mlp": 0.0123243, + "balance_loss_clip": 1.14749098, + "balance_loss_mlp": 1.02166748, + "epoch": 0.513632947542462, + "flos": 22677093825120.0, + "grad_norm": 2.018682342641929, + "language_loss": 0.79305458, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81994849, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.8478708267211914 + }, + { + "auxiliary_loss_clip": 0.01462983, + "auxiliary_loss_mlp": 0.01241717, + "balance_loss_clip": 1.15271258, + "balance_loss_mlp": 1.03209949, + "epoch": 0.51369307079513, + "flos": 29459551016160.0, + "grad_norm": 1.7622826396987423, + "language_loss": 0.67728704, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70433408, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.8499042987823486 + }, + { + "auxiliary_loss_clip": 0.01453938, + "auxiliary_loss_mlp": 0.01239336, + "balance_loss_clip": 1.14390731, + "balance_loss_mlp": 1.03048182, + "epoch": 0.513753194047798, + "flos": 18991350152640.0, + "grad_norm": 2.6120104334505037, + "language_loss": 0.72277069, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74970347, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 2.8148980140686035 + }, + { + "auxiliary_loss_clip": 0.01460862, + "auxiliary_loss_mlp": 0.01249083, + "balance_loss_clip": 1.15111518, + "balance_loss_mlp": 1.0369854, + "epoch": 0.513813317300466, + "flos": 17823795945120.0, + "grad_norm": 2.294999776201543, + "language_loss": 0.82272303, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84982252, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 4.2369630336761475 + }, + { + "auxiliary_loss_clip": 0.01453473, + "auxiliary_loss_mlp": 0.01240491, + "balance_loss_clip": 1.14305949, + "balance_loss_mlp": 1.0308733, + "epoch": 0.5138734405531339, + "flos": 24063495701760.0, + "grad_norm": 1.7970656214266323, + "language_loss": 0.73697364, + "learning_rate": 2.007205025522544e-06, + "loss": 0.76391327, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.8378868103027344 + }, + { + "auxiliary_loss_clip": 0.01467655, + "auxiliary_loss_mlp": 0.01238671, + "balance_loss_clip": 1.15610695, + "balance_loss_mlp": 1.02924383, + "epoch": 0.5139335638058019, + "flos": 26099272163520.0, + "grad_norm": 1.7470528127661205, + "language_loss": 0.73426706, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.76133031, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.825486183166504 + }, + { + "auxiliary_loss_clip": 0.01464392, + "auxiliary_loss_mlp": 0.01241203, + "balance_loss_clip": 1.15333676, + "balance_loss_mlp": 1.03387415, + "epoch": 0.5139936870584698, + "flos": 18919134207360.0, + "grad_norm": 1.9798925710356037, + "language_loss": 0.82160354, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84865952, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 2.7595155239105225 + }, + { + "auxiliary_loss_clip": 0.01471916, + "auxiliary_loss_mlp": 0.01240139, + "balance_loss_clip": 1.16001785, + "balance_loss_mlp": 1.0333823, + "epoch": 0.5140538103111378, + "flos": 16145875316160.0, + "grad_norm": 25.407455642002233, + "language_loss": 0.72151172, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74863231, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 2.7704102993011475 + }, + { + "auxiliary_loss_clip": 0.01464628, + "auxiliary_loss_mlp": 0.01237196, + "balance_loss_clip": 1.15294659, + "balance_loss_mlp": 1.02853251, + "epoch": 0.5141139335638057, + "flos": 22422403680480.0, + "grad_norm": 1.4924945017745779, + "language_loss": 0.75105363, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77807188, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.820225954055786 + }, + { + "auxiliary_loss_clip": 0.01474098, + "auxiliary_loss_mlp": 0.01244963, + "balance_loss_clip": 1.16385901, + "balance_loss_mlp": 1.03744376, + "epoch": 0.5141740568164738, + "flos": 27092127444480.0, + "grad_norm": 1.7786719394056247, + "language_loss": 0.69631696, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.72350758, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 4.332324743270874 + }, + { + "auxiliary_loss_clip": 0.01465909, + "auxiliary_loss_mlp": 0.01240661, + "balance_loss_clip": 1.15450287, + "balance_loss_mlp": 1.03333223, + "epoch": 0.5142341800691418, + "flos": 24975601201440.0, + "grad_norm": 1.9301586316075086, + "language_loss": 0.74414539, + "learning_rate": 2.004868266210965e-06, + "loss": 0.77121115, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 2.890414237976074 + }, + { + "auxiliary_loss_clip": 0.01469114, + "auxiliary_loss_mlp": 0.01246034, + "balance_loss_clip": 1.15765142, + "balance_loss_mlp": 1.038324, + "epoch": 0.5142943033218097, + "flos": 20706630383520.0, + "grad_norm": 1.938665333631639, + "language_loss": 0.67838144, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70553291, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 4.326183319091797 + }, + { + "auxiliary_loss_clip": 0.01466476, + "auxiliary_loss_mlp": 0.01246853, + "balance_loss_clip": 1.15403473, + "balance_loss_mlp": 1.0379982, + "epoch": 0.5143544265744777, + "flos": 22927460159520.0, + "grad_norm": 2.1264928494815494, + "language_loss": 0.73524654, + "learning_rate": 2.004089344806068e-06, + "loss": 0.76237983, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 2.7933003902435303 + }, + { + "auxiliary_loss_clip": 0.01458837, + "auxiliary_loss_mlp": 0.01236835, + "balance_loss_clip": 1.14615035, + "balance_loss_mlp": 1.03122294, + "epoch": 0.5144145498271456, + "flos": 15923045190240.0, + "grad_norm": 2.467775216148461, + "language_loss": 0.74260074, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76955748, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.7978954315185547 + }, + { + "auxiliary_loss_clip": 0.01461813, + "auxiliary_loss_mlp": 0.01239713, + "balance_loss_clip": 1.15085757, + "balance_loss_mlp": 1.03410077, + "epoch": 0.5144746730798136, + "flos": 19683204641280.0, + "grad_norm": 1.8557525982712664, + "language_loss": 0.86150455, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88851988, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 2.8582496643066406 + }, + { + "auxiliary_loss_clip": 0.01465981, + "auxiliary_loss_mlp": 0.01228922, + "balance_loss_clip": 1.15546489, + "balance_loss_mlp": 1.02197456, + "epoch": 0.5145347963324816, + "flos": 23916977762400.0, + "grad_norm": 1.9592225666622483, + "language_loss": 0.89077258, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91772163, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.7975728511810303 + }, + { + "auxiliary_loss_clip": 0.01475021, + "auxiliary_loss_mlp": 0.01236644, + "balance_loss_clip": 1.16493118, + "balance_loss_mlp": 1.02759898, + "epoch": 0.5145949195851496, + "flos": 18261984349440.0, + "grad_norm": 1.808064454520638, + "language_loss": 0.65299892, + "learning_rate": 2.002531500253602e-06, + "loss": 0.68011558, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 4.271456480026245 + }, + { + "auxiliary_loss_clip": 0.01473305, + "auxiliary_loss_mlp": 0.01238837, + "balance_loss_clip": 1.16283464, + "balance_loss_mlp": 1.03227127, + "epoch": 0.5146550428378175, + "flos": 26215826492160.0, + "grad_norm": 1.640075045942125, + "language_loss": 0.63337088, + "learning_rate": 2.002142038838577e-06, + "loss": 0.66049224, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 2.8716704845428467 + }, + { + "auxiliary_loss_clip": 0.01471343, + "auxiliary_loss_mlp": 0.01229279, + "balance_loss_clip": 1.16061294, + "balance_loss_mlp": 1.02233219, + "epoch": 0.5147151660904855, + "flos": 22676373190080.0, + "grad_norm": 1.5325795889134024, + "language_loss": 0.69738686, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72439313, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.7963643074035645 + }, + { + "auxiliary_loss_clip": 0.01469946, + "auxiliary_loss_mlp": 0.0123178, + "balance_loss_clip": 1.15894091, + "balance_loss_mlp": 1.02502298, + "epoch": 0.5147752893431534, + "flos": 24974690925600.0, + "grad_norm": 1.562597453264656, + "language_loss": 0.66653174, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.69354904, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.8202476501464844 + }, + { + "auxiliary_loss_clip": 0.01471251, + "auxiliary_loss_mlp": 0.01241488, + "balance_loss_clip": 1.1608851, + "balance_loss_mlp": 1.03415871, + "epoch": 0.5148354125958214, + "flos": 22746730655520.0, + "grad_norm": 1.631540357442329, + "language_loss": 0.77643168, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.80355906, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 2.781475782394409 + }, + { + "auxiliary_loss_clip": 0.01465111, + "auxiliary_loss_mlp": 0.01231656, + "balance_loss_clip": 1.15377784, + "balance_loss_mlp": 1.02280116, + "epoch": 0.5148955358484893, + "flos": 23070602492640.0, + "grad_norm": 2.442964577043676, + "language_loss": 0.83041275, + "learning_rate": 2.0005841925139e-06, + "loss": 0.85738045, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.78762149810791 + }, + { + "auxiliary_loss_clip": 0.01469686, + "auxiliary_loss_mlp": 0.01242465, + "balance_loss_clip": 1.15779138, + "balance_loss_mlp": 1.03227496, + "epoch": 0.5149556591011574, + "flos": 20342250907200.0, + "grad_norm": 1.881430194341183, + "language_loss": 0.73096418, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75808561, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.7837414741516113 + }, + { + "auxiliary_loss_clip": 0.01465807, + "auxiliary_loss_mlp": 0.01251212, + "balance_loss_clip": 1.15414321, + "balance_loss_mlp": 1.04197574, + "epoch": 0.5150157823538254, + "flos": 22640454858240.0, + "grad_norm": 1.987214738216535, + "language_loss": 0.68189651, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70906669, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 2.7830731868743896 + }, + { + "auxiliary_loss_clip": 0.01459256, + "auxiliary_loss_mlp": 0.01239346, + "balance_loss_clip": 1.14592505, + "balance_loss_mlp": 1.02991891, + "epoch": 0.5150759056064933, + "flos": 26070029187840.0, + "grad_norm": 1.8724954511249967, + "language_loss": 0.78140563, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80839169, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.846266746520996 + }, + { + "auxiliary_loss_clip": 0.0146405, + "auxiliary_loss_mlp": 0.01250066, + "balance_loss_clip": 1.15184855, + "balance_loss_mlp": 1.04102039, + "epoch": 0.5151360288591613, + "flos": 25954726488480.0, + "grad_norm": 2.0694589583929317, + "language_loss": 0.79026222, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81740344, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.8059403896331787 + }, + { + "auxiliary_loss_clip": 0.01467326, + "auxiliary_loss_mlp": 0.01231236, + "balance_loss_clip": 1.15515542, + "balance_loss_mlp": 1.02505159, + "epoch": 0.5151961521118292, + "flos": 18508785436800.0, + "grad_norm": 2.208364863958527, + "language_loss": 0.90607661, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93306226, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.811447858810425 + }, + { + "auxiliary_loss_clip": 0.0146792, + "auxiliary_loss_mlp": 0.01249187, + "balance_loss_clip": 1.15441978, + "balance_loss_mlp": 1.04109573, + "epoch": 0.5152562753644973, + "flos": 22235719455360.0, + "grad_norm": 2.0260102835199523, + "language_loss": 0.76640701, + "learning_rate": 1.998247422657674e-06, + "loss": 0.79357809, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 2.7628753185272217 + }, + { + "auxiliary_loss_clip": 0.01468196, + "auxiliary_loss_mlp": 0.01248825, + "balance_loss_clip": 1.15341794, + "balance_loss_mlp": 1.03978002, + "epoch": 0.5153163986171652, + "flos": 38439663513120.0, + "grad_norm": 3.106932174344069, + "language_loss": 0.73808837, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.76525867, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.9222850799560547 + }, + { + "auxiliary_loss_clip": 0.01598186, + "auxiliary_loss_mlp": 0.01192421, + "balance_loss_clip": 1.30918419, + "balance_loss_mlp": 0.9940567, + "epoch": 0.5153765218698332, + "flos": 66391213514400.0, + "grad_norm": 0.7826514773710541, + "language_loss": 0.52911854, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.5570246, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.4749860763549805 + }, + { + "auxiliary_loss_clip": 0.01469845, + "auxiliary_loss_mlp": 0.01249597, + "balance_loss_clip": 1.15883636, + "balance_loss_mlp": 1.04245913, + "epoch": 0.5154366451225011, + "flos": 24026970519360.0, + "grad_norm": 1.9647397771860855, + "language_loss": 0.75873208, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78592646, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 2.8682422637939453 + }, + { + "auxiliary_loss_clip": 0.01461461, + "auxiliary_loss_mlp": 0.01232618, + "balance_loss_clip": 1.14927053, + "balance_loss_mlp": 1.02643359, + "epoch": 0.5154967683751691, + "flos": 23470634803680.0, + "grad_norm": 1.9741370080588387, + "language_loss": 0.77055818, + "learning_rate": 1.996689577219102e-06, + "loss": 0.797499, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.8064839839935303 + }, + { + "auxiliary_loss_clip": 0.01470894, + "auxiliary_loss_mlp": 0.01237717, + "balance_loss_clip": 1.15914607, + "balance_loss_mlp": 1.0330584, + "epoch": 0.515556891627837, + "flos": 23807933209440.0, + "grad_norm": 4.08212518242826, + "language_loss": 0.85759604, + "learning_rate": 1.996300116136367e-06, + "loss": 0.88468218, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 2.7315258979797363 + }, + { + "auxiliary_loss_clip": 0.01469705, + "auxiliary_loss_mlp": 0.01255967, + "balance_loss_clip": 1.15791917, + "balance_loss_mlp": 1.04959154, + "epoch": 0.515617014880505, + "flos": 19830594928320.0, + "grad_norm": 1.5474961541725987, + "language_loss": 0.76954919, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79680586, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.8533401489257812 + }, + { + "auxiliary_loss_clip": 0.01467038, + "auxiliary_loss_mlp": 0.01260516, + "balance_loss_clip": 1.15347028, + "balance_loss_mlp": 1.05356884, + "epoch": 0.515677138133173, + "flos": 14247627819840.0, + "grad_norm": 2.3615636952388765, + "language_loss": 0.75512969, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.78240526, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 2.7102725505828857 + }, + { + "auxiliary_loss_clip": 0.01463802, + "auxiliary_loss_mlp": 0.01243454, + "balance_loss_clip": 1.14942646, + "balance_loss_mlp": 1.03574395, + "epoch": 0.515737261385841, + "flos": 28292034736800.0, + "grad_norm": 1.9320650522520437, + "language_loss": 0.80618322, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83325577, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.8715479373931885 + }, + { + "auxiliary_loss_clip": 0.01467406, + "auxiliary_loss_mlp": 0.01244797, + "balance_loss_clip": 1.15481699, + "balance_loss_mlp": 1.04071081, + "epoch": 0.515797384638509, + "flos": 27894429828000.0, + "grad_norm": 2.160619325638947, + "language_loss": 0.76203489, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78915691, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 2.805084705352783 + }, + { + "auxiliary_loss_clip": 0.01466908, + "auxiliary_loss_mlp": 0.01242347, + "balance_loss_clip": 1.15247035, + "balance_loss_mlp": 1.03616297, + "epoch": 0.5158575078911769, + "flos": 23042345649120.0, + "grad_norm": 1.609460991935891, + "language_loss": 0.78871411, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81580675, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 2.779799222946167 + }, + { + "auxiliary_loss_clip": 0.01466386, + "auxiliary_loss_mlp": 0.01247054, + "balance_loss_clip": 1.15301704, + "balance_loss_mlp": 1.0383904, + "epoch": 0.5159176311438449, + "flos": 12643023052800.0, + "grad_norm": 2.134752192467772, + "language_loss": 0.72831708, + "learning_rate": 1.99396335310315e-06, + "loss": 0.7554515, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 2.8571107387542725 + }, + { + "auxiliary_loss_clip": 0.01463865, + "auxiliary_loss_mlp": 0.01242371, + "balance_loss_clip": 1.1510725, + "balance_loss_mlp": 1.03885674, + "epoch": 0.5159777543965128, + "flos": 15559879415040.0, + "grad_norm": 2.149443997177523, + "language_loss": 0.73804492, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76510733, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 2.722532272338867 + }, + { + "auxiliary_loss_clip": 0.01465004, + "auxiliary_loss_mlp": 0.01238985, + "balance_loss_clip": 1.15145516, + "balance_loss_mlp": 1.03222847, + "epoch": 0.5160378776491809, + "flos": 23223909572640.0, + "grad_norm": 1.9623052491599455, + "language_loss": 0.66237807, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68941796, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 2.8440980911254883 + }, + { + "auxiliary_loss_clip": 0.01468155, + "auxiliary_loss_mlp": 0.01253788, + "balance_loss_clip": 1.15403545, + "balance_loss_mlp": 1.04607809, + "epoch": 0.5160980009018488, + "flos": 21946779817920.0, + "grad_norm": 1.3972822977872152, + "language_loss": 0.76310748, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.79032689, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 4.2388081550598145 + }, + { + "auxiliary_loss_clip": 0.01459596, + "auxiliary_loss_mlp": 0.01243869, + "balance_loss_clip": 1.14611578, + "balance_loss_mlp": 1.03749371, + "epoch": 0.5161581241545168, + "flos": 22786290090720.0, + "grad_norm": 2.4379690544762496, + "language_loss": 0.79128385, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.81831849, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 2.8228039741516113 + }, + { + "auxiliary_loss_clip": 0.01467426, + "auxiliary_loss_mlp": 0.01240337, + "balance_loss_clip": 1.15290642, + "balance_loss_mlp": 1.03663218, + "epoch": 0.5162182474071847, + "flos": 19677022351200.0, + "grad_norm": 2.2825537611271, + "language_loss": 0.80820405, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83528173, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 2.7868800163269043 + }, + { + "auxiliary_loss_clip": 0.01469772, + "auxiliary_loss_mlp": 0.01251704, + "balance_loss_clip": 1.15610921, + "balance_loss_mlp": 1.04590154, + "epoch": 0.5162783706598527, + "flos": 20048190968160.0, + "grad_norm": 1.809701156781059, + "language_loss": 0.71816516, + "learning_rate": 1.991626598310701e-06, + "loss": 0.74537992, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.757946729660034 + }, + { + "auxiliary_loss_clip": 0.01587324, + "auxiliary_loss_mlp": 0.01216949, + "balance_loss_clip": 1.29894698, + "balance_loss_mlp": 1.01934814, + "epoch": 0.5163384939125206, + "flos": 69966433435680.0, + "grad_norm": 0.7237025662167318, + "language_loss": 0.57721281, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.6052556, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 3.4031624794006348 + }, + { + "auxiliary_loss_clip": 0.01464933, + "auxiliary_loss_mlp": 0.0123422, + "balance_loss_clip": 1.15097404, + "balance_loss_mlp": 1.02574682, + "epoch": 0.5163986171651886, + "flos": 17418757116960.0, + "grad_norm": 12.700874020925061, + "language_loss": 0.75234234, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77933383, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.73579478263855 + }, + { + "auxiliary_loss_clip": 0.01466653, + "auxiliary_loss_mlp": 0.01242417, + "balance_loss_clip": 1.15278566, + "balance_loss_mlp": 1.03527915, + "epoch": 0.5164587404178566, + "flos": 21324713872320.0, + "grad_norm": 1.7061993422989699, + "language_loss": 0.6723392, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69942999, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.7915163040161133 + }, + { + "auxiliary_loss_clip": 0.01583307, + "auxiliary_loss_mlp": 0.01209129, + "balance_loss_clip": 1.29547071, + "balance_loss_mlp": 1.01229095, + "epoch": 0.5165188636705246, + "flos": 68063634560160.0, + "grad_norm": 0.7753968025224777, + "language_loss": 0.55773878, + "learning_rate": 1.990068767935895e-06, + "loss": 0.5856632, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 4.460227012634277 + }, + { + "auxiliary_loss_clip": 0.0146453, + "auxiliary_loss_mlp": 0.01242344, + "balance_loss_clip": 1.15133166, + "balance_loss_mlp": 1.03921103, + "epoch": 0.5165789869231926, + "flos": 19387400006880.0, + "grad_norm": 1.6324062733928888, + "language_loss": 0.81496674, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.84203553, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 4.379795789718628 + }, + { + "auxiliary_loss_clip": 0.01465721, + "auxiliary_loss_mlp": 0.01239227, + "balance_loss_clip": 1.15272522, + "balance_loss_mlp": 1.03227925, + "epoch": 0.5166391101758605, + "flos": 20962344588480.0, + "grad_norm": 2.195867414802192, + "language_loss": 0.83329308, + "learning_rate": 1.989289854948979e-06, + "loss": 0.8603425, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 2.7857446670532227 + }, + { + "auxiliary_loss_clip": 0.01458567, + "auxiliary_loss_mlp": 0.01237249, + "balance_loss_clip": 1.14485896, + "balance_loss_mlp": 1.02915692, + "epoch": 0.5166992334285285, + "flos": 29465088527520.0, + "grad_norm": 1.7513900785280734, + "language_loss": 0.68769586, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71465403, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.87385892868042 + }, + { + "auxiliary_loss_clip": 0.0146162, + "auxiliary_loss_mlp": 0.01245478, + "balance_loss_clip": 1.14828098, + "balance_loss_mlp": 1.03872156, + "epoch": 0.5167593566811964, + "flos": 20306522216160.0, + "grad_norm": 1.4647335149969682, + "language_loss": 0.77286649, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79993749, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.767653226852417 + }, + { + "auxiliary_loss_clip": 0.01464934, + "auxiliary_loss_mlp": 0.01239625, + "balance_loss_clip": 1.15126932, + "balance_loss_mlp": 1.03382182, + "epoch": 0.5168194799338645, + "flos": 14613145140960.0, + "grad_norm": 1.5627176426716878, + "language_loss": 0.65497488, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.68202043, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 2.8332555294036865 + }, + { + "auxiliary_loss_clip": 0.01462603, + "auxiliary_loss_mlp": 0.01235646, + "balance_loss_clip": 1.14771318, + "balance_loss_mlp": 1.0267911, + "epoch": 0.5168796031865324, + "flos": 25009471412640.0, + "grad_norm": 1.6367394976280567, + "language_loss": 0.75625926, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.78324175, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 4.380585193634033 + }, + { + "auxiliary_loss_clip": 0.01466044, + "auxiliary_loss_mlp": 0.01237838, + "balance_loss_clip": 1.15141606, + "balance_loss_mlp": 1.03222585, + "epoch": 0.5169397264392004, + "flos": 26942233898880.0, + "grad_norm": 1.5591516454888266, + "language_loss": 0.81422961, + "learning_rate": 1.987342579847403e-06, + "loss": 0.84126836, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.7783048152923584 + }, + { + "auxiliary_loss_clip": 0.01468219, + "auxiliary_loss_mlp": 0.01240203, + "balance_loss_clip": 1.15413713, + "balance_loss_mlp": 1.03401875, + "epoch": 0.5169998496918683, + "flos": 25409996789760.0, + "grad_norm": 1.6106853919701292, + "language_loss": 0.75469697, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.7817812, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.851229429244995 + }, + { + "auxiliary_loss_clip": 0.01467415, + "auxiliary_loss_mlp": 0.01243559, + "balance_loss_clip": 1.15364861, + "balance_loss_mlp": 1.03603971, + "epoch": 0.5170599729445363, + "flos": 24683096316960.0, + "grad_norm": 2.763070398703343, + "language_loss": 0.73048848, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.75759816, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 2.889139413833618 + }, + { + "auxiliary_loss_clip": 0.01467084, + "auxiliary_loss_mlp": 0.01244783, + "balance_loss_clip": 1.153584, + "balance_loss_mlp": 1.03764498, + "epoch": 0.5171200961972042, + "flos": 20996480296800.0, + "grad_norm": 1.4643390543845043, + "language_loss": 0.7431978, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.77031648, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.8054039478302 + }, + { + "auxiliary_loss_clip": 0.01460489, + "auxiliary_loss_mlp": 0.01239869, + "balance_loss_clip": 1.14659441, + "balance_loss_mlp": 1.03177691, + "epoch": 0.5171802194498722, + "flos": 22747754715840.0, + "grad_norm": 2.0075345925578354, + "language_loss": 0.832555, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85955858, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 2.811544418334961 + }, + { + "auxiliary_loss_clip": 0.01465585, + "auxiliary_loss_mlp": 0.01229879, + "balance_loss_clip": 1.15394187, + "balance_loss_mlp": 1.02083361, + "epoch": 0.5172403427025402, + "flos": 28178856014400.0, + "grad_norm": 1.8985532562157938, + "language_loss": 0.74828672, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.77524137, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.8764426708221436 + }, + { + "auxiliary_loss_clip": 0.01464307, + "auxiliary_loss_mlp": 0.01251771, + "balance_loss_clip": 1.15207934, + "balance_loss_mlp": 1.0450139, + "epoch": 0.5173004659552082, + "flos": 20339823504960.0, + "grad_norm": 2.9066713401369073, + "language_loss": 0.72415864, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.75131935, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.8032617568969727 + }, + { + "auxiliary_loss_clip": 0.01459417, + "auxiliary_loss_mlp": 0.01245279, + "balance_loss_clip": 1.14738631, + "balance_loss_mlp": 1.03661525, + "epoch": 0.5173605892078762, + "flos": 19065386649600.0, + "grad_norm": 2.0031422683757985, + "language_loss": 0.85083258, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87787956, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 2.787320375442505 + }, + { + "auxiliary_loss_clip": 0.01462937, + "auxiliary_loss_mlp": 0.01236273, + "balance_loss_clip": 1.15100324, + "balance_loss_mlp": 1.02913475, + "epoch": 0.5174207124605441, + "flos": 27997330019040.0, + "grad_norm": 1.6337834420509445, + "language_loss": 0.64724755, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67423964, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 2.8627007007598877 + }, + { + "auxiliary_loss_clip": 0.01466433, + "auxiliary_loss_mlp": 0.0123491, + "balance_loss_clip": 1.15445828, + "balance_loss_mlp": 1.02815318, + "epoch": 0.5174808357132121, + "flos": 19498492680480.0, + "grad_norm": 1.7760896137090583, + "language_loss": 0.77937627, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80638969, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.8269505500793457 + }, + { + "auxiliary_loss_clip": 0.01473765, + "auxiliary_loss_mlp": 0.01240263, + "balance_loss_clip": 1.16284943, + "balance_loss_mlp": 1.03236234, + "epoch": 0.51754095896588, + "flos": 22786403875200.0, + "grad_norm": 2.1511754119288744, + "language_loss": 0.71920979, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74635011, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 2.850504159927368 + }, + { + "auxiliary_loss_clip": 0.01467923, + "auxiliary_loss_mlp": 0.01243909, + "balance_loss_clip": 1.15717411, + "balance_loss_mlp": 1.03390968, + "epoch": 0.5176010822185481, + "flos": 22671177032160.0, + "grad_norm": 1.8532071480894405, + "language_loss": 0.86650848, + "learning_rate": 1.983058619460531e-06, + "loss": 0.89362681, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.815269947052002 + }, + { + "auxiliary_loss_clip": 0.01466953, + "auxiliary_loss_mlp": 0.01228028, + "balance_loss_clip": 1.15603995, + "balance_loss_mlp": 1.02108073, + "epoch": 0.517661205471216, + "flos": 23953465016640.0, + "grad_norm": 1.7270367157207995, + "language_loss": 0.7375046, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.76445442, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.8339781761169434 + }, + { + "auxiliary_loss_clip": 0.01469216, + "auxiliary_loss_mlp": 0.0124123, + "balance_loss_clip": 1.15953708, + "balance_loss_mlp": 1.03065896, + "epoch": 0.517721328723884, + "flos": 15597883795680.0, + "grad_norm": 2.7350759912517355, + "language_loss": 0.67182261, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69892704, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.8373565673828125 + }, + { + "auxiliary_loss_clip": 0.01462083, + "auxiliary_loss_mlp": 0.0123811, + "balance_loss_clip": 1.15237594, + "balance_loss_mlp": 1.03192556, + "epoch": 0.5177814519765519, + "flos": 20962344588480.0, + "grad_norm": 2.0605256205684306, + "language_loss": 0.77468562, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.80168748, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 2.822232961654663 + }, + { + "auxiliary_loss_clip": 0.01471682, + "auxiliary_loss_mlp": 0.01238815, + "balance_loss_clip": 1.1606648, + "balance_loss_mlp": 1.03053212, + "epoch": 0.5178415752292199, + "flos": 17969782890240.0, + "grad_norm": 1.9950917404059074, + "language_loss": 0.81970644, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84681135, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.8407793045043945 + }, + { + "auxiliary_loss_clip": 0.01470621, + "auxiliary_loss_mlp": 0.01252082, + "balance_loss_clip": 1.16110969, + "balance_loss_mlp": 1.0445627, + "epoch": 0.5179016984818878, + "flos": 17823416663520.0, + "grad_norm": 2.5808162437533784, + "language_loss": 0.6646595, + "learning_rate": 1.981111389254541e-06, + "loss": 0.69188648, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 2.884256601333618 + }, + { + "auxiliary_loss_clip": 0.0146757, + "auxiliary_loss_mlp": 0.01235072, + "balance_loss_clip": 1.15663362, + "balance_loss_mlp": 1.02774358, + "epoch": 0.5179618217345558, + "flos": 17822316746880.0, + "grad_norm": 2.1151959646873397, + "language_loss": 0.86327982, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.89030623, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.754687547683716 + }, + { + "auxiliary_loss_clip": 0.01469825, + "auxiliary_loss_mlp": 0.0123997, + "balance_loss_clip": 1.15885711, + "balance_loss_mlp": 1.03645563, + "epoch": 0.5180219449872238, + "flos": 22523976385920.0, + "grad_norm": 1.8732174686721037, + "language_loss": 0.80924851, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.83634651, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 2.7694058418273926 + }, + { + "auxiliary_loss_clip": 0.01476037, + "auxiliary_loss_mlp": 0.01252834, + "balance_loss_clip": 1.16725087, + "balance_loss_mlp": 1.04264474, + "epoch": 0.5180820682398918, + "flos": 23917774253760.0, + "grad_norm": 1.7682200470528437, + "language_loss": 0.7512297, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77851844, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.740290403366089 + }, + { + "auxiliary_loss_clip": 0.01467066, + "auxiliary_loss_mlp": 0.01242466, + "balance_loss_clip": 1.15619183, + "balance_loss_mlp": 1.03437448, + "epoch": 0.5181421914925598, + "flos": 16981668629280.0, + "grad_norm": 1.85021762145237, + "language_loss": 0.69732094, + "learning_rate": 1.979553617893785e-06, + "loss": 0.7244162, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 2.733011245727539 + }, + { + "auxiliary_loss_clip": 0.01600109, + "auxiliary_loss_mlp": 0.01202309, + "balance_loss_clip": 1.31471288, + "balance_loss_mlp": 1.0039444, + "epoch": 0.5182023147452277, + "flos": 66066772754880.0, + "grad_norm": 0.9430190902210488, + "language_loss": 0.67272061, + "learning_rate": 1.979164176954999e-06, + "loss": 0.70074475, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 3.236201524734497 + }, + { + "auxiliary_loss_clip": 0.01470958, + "auxiliary_loss_mlp": 0.01236824, + "balance_loss_clip": 1.16189909, + "balance_loss_mlp": 1.03140211, + "epoch": 0.5182624379978957, + "flos": 18189768404160.0, + "grad_norm": 2.6425399062822423, + "language_loss": 0.79406834, + "learning_rate": 1.97877473680631e-06, + "loss": 0.82114613, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 2.819284200668335 + }, + { + "auxiliary_loss_clip": 0.01467941, + "auxiliary_loss_mlp": 0.01240683, + "balance_loss_clip": 1.1582197, + "balance_loss_mlp": 1.03449821, + "epoch": 0.5183225612505636, + "flos": 14028400869120.0, + "grad_norm": 2.72842760713726, + "language_loss": 0.82309932, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.85018557, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 4.2251927852630615 + }, + { + "auxiliary_loss_clip": 0.01468818, + "auxiliary_loss_mlp": 0.01241752, + "balance_loss_clip": 1.15865707, + "balance_loss_mlp": 1.03499508, + "epoch": 0.5183826845032317, + "flos": 23662060048800.0, + "grad_norm": 2.0364818056939957, + "language_loss": 0.65607113, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.68317688, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 2.7597179412841797 + }, + { + "auxiliary_loss_clip": 0.01467663, + "auxiliary_loss_mlp": 0.01246773, + "balance_loss_clip": 1.15814972, + "balance_loss_mlp": 1.03963506, + "epoch": 0.5184428077558996, + "flos": 15890464536480.0, + "grad_norm": 2.1167404790135893, + "language_loss": 0.60957444, + "learning_rate": 1.977606421248497e-06, + "loss": 0.63671875, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.8020071983337402 + }, + { + "auxiliary_loss_clip": 0.01466952, + "auxiliary_loss_mlp": 0.01238429, + "balance_loss_clip": 1.15503526, + "balance_loss_mlp": 1.03357923, + "epoch": 0.5185029310085676, + "flos": 21032853766560.0, + "grad_norm": 1.749593840419222, + "language_loss": 0.7628746, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78992844, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.8001585006713867 + }, + { + "auxiliary_loss_clip": 0.01462331, + "auxiliary_loss_mlp": 0.01243861, + "balance_loss_clip": 1.15181255, + "balance_loss_mlp": 1.03901148, + "epoch": 0.5185630542612355, + "flos": 26545159984320.0, + "grad_norm": 2.096114540710291, + "language_loss": 0.71272051, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73978245, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 2.9666576385498047 + }, + { + "auxiliary_loss_clip": 0.01466148, + "auxiliary_loss_mlp": 0.01247328, + "balance_loss_clip": 1.15551722, + "balance_loss_mlp": 1.04305136, + "epoch": 0.5186231775139035, + "flos": 20670560339040.0, + "grad_norm": 2.2260644334779904, + "language_loss": 0.67548454, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70261925, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.812051296234131 + }, + { + "auxiliary_loss_clip": 0.01466075, + "auxiliary_loss_mlp": 0.0123941, + "balance_loss_clip": 1.15615726, + "balance_loss_mlp": 1.03131831, + "epoch": 0.5186833007665714, + "flos": 20887549528320.0, + "grad_norm": 1.904879125630726, + "language_loss": 0.70551264, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.73256755, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.75140380859375 + }, + { + "auxiliary_loss_clip": 0.01473711, + "auxiliary_loss_mlp": 0.01253222, + "balance_loss_clip": 1.16311693, + "balance_loss_mlp": 1.0447489, + "epoch": 0.5187434240192395, + "flos": 20889028726560.0, + "grad_norm": 2.53949712647992, + "language_loss": 0.73156691, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75883627, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 2.855437755584717 + }, + { + "auxiliary_loss_clip": 0.01461833, + "auxiliary_loss_mlp": 0.0123751, + "balance_loss_clip": 1.15094709, + "balance_loss_mlp": 1.03189731, + "epoch": 0.5188035472719074, + "flos": 19861696383840.0, + "grad_norm": 1.841236843382334, + "language_loss": 0.77660275, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.80359614, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 4.127071380615234 + }, + { + "auxiliary_loss_clip": 0.0146554, + "auxiliary_loss_mlp": 0.01248424, + "balance_loss_clip": 1.15434957, + "balance_loss_mlp": 1.03975987, + "epoch": 0.5188636705245754, + "flos": 21140229480480.0, + "grad_norm": 2.6461275434795466, + "language_loss": 0.74640024, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.77353984, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.766474962234497 + }, + { + "auxiliary_loss_clip": 0.01464193, + "auxiliary_loss_mlp": 0.01240837, + "balance_loss_clip": 1.15331006, + "balance_loss_mlp": 1.03408015, + "epoch": 0.5189237937772434, + "flos": 22422024398880.0, + "grad_norm": 1.7969156059372557, + "language_loss": 0.80602896, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.83307922, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 4.324849605560303 + }, + { + "auxiliary_loss_clip": 0.01462363, + "auxiliary_loss_mlp": 0.01248913, + "balance_loss_clip": 1.1512301, + "balance_loss_mlp": 1.04234743, + "epoch": 0.5189839170299113, + "flos": 25449025230720.0, + "grad_norm": 1.6569407409459769, + "language_loss": 0.74505752, + "learning_rate": 1.974101522024942e-06, + "loss": 0.77217031, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 2.8730666637420654 + }, + { + "auxiliary_loss_clip": 0.01466694, + "auxiliary_loss_mlp": 0.01239816, + "balance_loss_clip": 1.15550959, + "balance_loss_mlp": 1.03191495, + "epoch": 0.5190440402825793, + "flos": 18589686930720.0, + "grad_norm": 2.2125950109935557, + "language_loss": 0.78689075, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.81395578, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.7365550994873047 + }, + { + "auxiliary_loss_clip": 0.01467118, + "auxiliary_loss_mlp": 0.01237127, + "balance_loss_clip": 1.15558088, + "balance_loss_mlp": 1.03132367, + "epoch": 0.5191041635352472, + "flos": 21910823557920.0, + "grad_norm": 1.6457334482657844, + "language_loss": 0.80523014, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.83227259, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.803215503692627 + }, + { + "auxiliary_loss_clip": 0.01459218, + "auxiliary_loss_mlp": 0.01248741, + "balance_loss_clip": 1.14825141, + "balance_loss_mlp": 1.04331934, + "epoch": 0.5191642867879153, + "flos": 27530505489600.0, + "grad_norm": 1.8083070680558493, + "language_loss": 0.68619275, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.71327233, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.8208789825439453 + }, + { + "auxiliary_loss_clip": 0.01462661, + "auxiliary_loss_mlp": 0.01246713, + "balance_loss_clip": 1.15157771, + "balance_loss_mlp": 1.04014659, + "epoch": 0.5192244100405832, + "flos": 15707838624480.0, + "grad_norm": 1.7031303011697416, + "language_loss": 0.77900594, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.80609965, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 4.196723699569702 + }, + { + "auxiliary_loss_clip": 0.01458241, + "auxiliary_loss_mlp": 0.01239389, + "balance_loss_clip": 1.14647579, + "balance_loss_mlp": 1.03091586, + "epoch": 0.5192845332932512, + "flos": 12058695990720.0, + "grad_norm": 1.9653054303507873, + "language_loss": 0.71303409, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.74001038, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 2.8053081035614014 + }, + { + "auxiliary_loss_clip": 0.01461531, + "auxiliary_loss_mlp": 0.01231862, + "balance_loss_clip": 1.15041018, + "balance_loss_mlp": 1.02377009, + "epoch": 0.5193446565459191, + "flos": 18955204251840.0, + "grad_norm": 2.2164459905436216, + "language_loss": 0.75832433, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78525829, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 2.759230136871338 + }, + { + "auxiliary_loss_clip": 0.01452149, + "auxiliary_loss_mlp": 0.01226066, + "balance_loss_clip": 1.14087987, + "balance_loss_mlp": 1.02045369, + "epoch": 0.5194047797985871, + "flos": 20376993466080.0, + "grad_norm": 2.188547873619619, + "language_loss": 0.74459636, + "learning_rate": 1.971375543740272e-06, + "loss": 0.77137858, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 2.855233669281006 + }, + { + "auxiliary_loss_clip": 0.01459534, + "auxiliary_loss_mlp": 0.01236031, + "balance_loss_clip": 1.14909589, + "balance_loss_mlp": 1.0271765, + "epoch": 0.519464903051255, + "flos": 24355317879360.0, + "grad_norm": 1.6263863627604973, + "language_loss": 0.773561, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.80051672, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.804685592651367 + }, + { + "auxiliary_loss_clip": 0.01459763, + "auxiliary_loss_mlp": 0.012257, + "balance_loss_clip": 1.14894557, + "balance_loss_mlp": 1.01913452, + "epoch": 0.519525026303923, + "flos": 14063446853280.0, + "grad_norm": 1.7370974218748025, + "language_loss": 0.66223407, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68908876, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 2.9011480808258057 + }, + { + "auxiliary_loss_clip": 0.01456858, + "auxiliary_loss_mlp": 0.01244127, + "balance_loss_clip": 1.14706826, + "balance_loss_mlp": 1.038324, + "epoch": 0.519585149556591, + "flos": 28838167777440.0, + "grad_norm": 1.7093110414877073, + "language_loss": 0.76646531, + "learning_rate": 1.97020728331885e-06, + "loss": 0.79347515, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 2.8108677864074707 + }, + { + "auxiliary_loss_clip": 0.01457911, + "auxiliary_loss_mlp": 0.01234398, + "balance_loss_clip": 1.14620805, + "balance_loss_mlp": 1.02611578, + "epoch": 0.519645272809259, + "flos": 25375292159040.0, + "grad_norm": 1.8395528902628595, + "language_loss": 0.83264375, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85956681, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 2.8148858547210693 + }, + { + "auxiliary_loss_clip": 0.01454409, + "auxiliary_loss_mlp": 0.01238971, + "balance_loss_clip": 1.14306855, + "balance_loss_mlp": 1.03164256, + "epoch": 0.519705396061927, + "flos": 25375140446400.0, + "grad_norm": 2.951844218372528, + "language_loss": 0.70176983, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72870356, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.8334927558898926 + }, + { + "auxiliary_loss_clip": 0.01446489, + "auxiliary_loss_mlp": 0.01240098, + "balance_loss_clip": 1.13603532, + "balance_loss_mlp": 1.03353274, + "epoch": 0.5197655193145949, + "flos": 28478794818240.0, + "grad_norm": 1.6733239599808463, + "language_loss": 0.8005448, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.8274107, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 2.8115415573120117 + }, + { + "auxiliary_loss_clip": 0.01447542, + "auxiliary_loss_mlp": 0.0123089, + "balance_loss_clip": 1.13618326, + "balance_loss_mlp": 1.02337074, + "epoch": 0.5198256425672629, + "flos": 20011096863360.0, + "grad_norm": 2.0257097625760405, + "language_loss": 0.78004074, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80682504, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 2.836941957473755 + }, + { + "auxiliary_loss_clip": 0.01458733, + "auxiliary_loss_mlp": 0.01239424, + "balance_loss_clip": 1.14883089, + "balance_loss_mlp": 1.03209496, + "epoch": 0.5198857658199308, + "flos": 19830936281760.0, + "grad_norm": 2.019890133728593, + "language_loss": 0.66163266, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68861419, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 2.7409088611602783 + }, + { + "auxiliary_loss_clip": 0.01456053, + "auxiliary_loss_mlp": 0.01251919, + "balance_loss_clip": 1.14464533, + "balance_loss_mlp": 1.04344594, + "epoch": 0.5199458890725989, + "flos": 24464210719680.0, + "grad_norm": 3.284370178977185, + "language_loss": 0.71607733, + "learning_rate": 1.967870793377763e-06, + "loss": 0.74315703, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.81754994392395 + }, + { + "auxiliary_loss_clip": 0.01456547, + "auxiliary_loss_mlp": 0.01244222, + "balance_loss_clip": 1.14560366, + "balance_loss_mlp": 1.03765595, + "epoch": 0.5200060123252668, + "flos": 23407066478880.0, + "grad_norm": 2.1754224655810708, + "language_loss": 0.64217079, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66917849, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.7514076232910156 + }, + { + "auxiliary_loss_clip": 0.01461576, + "auxiliary_loss_mlp": 0.01245197, + "balance_loss_clip": 1.15169048, + "balance_loss_mlp": 1.03538895, + "epoch": 0.5200661355779348, + "flos": 17203247125920.0, + "grad_norm": 5.123107434267076, + "language_loss": 0.70596749, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.73303521, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.7129969596862793 + }, + { + "auxiliary_loss_clip": 0.0145786, + "auxiliary_loss_mlp": 0.01241322, + "balance_loss_clip": 1.14976263, + "balance_loss_mlp": 1.03551936, + "epoch": 0.5201262588306027, + "flos": 18517053775680.0, + "grad_norm": 1.6619771164531023, + "language_loss": 0.77603734, + "learning_rate": 1.966702564655496e-06, + "loss": 0.80302918, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 2.7789645195007324 + }, + { + "auxiliary_loss_clip": 0.01465588, + "auxiliary_loss_mlp": 0.01249349, + "balance_loss_clip": 1.15646482, + "balance_loss_mlp": 1.03954065, + "epoch": 0.5201863820832707, + "flos": 18621091811520.0, + "grad_norm": 1.6919073101704472, + "language_loss": 0.78815365, + "learning_rate": 1.966313157587003e-06, + "loss": 0.81530303, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.7808997631073 + }, + { + "auxiliary_loss_clip": 0.01459675, + "auxiliary_loss_mlp": 0.01238307, + "balance_loss_clip": 1.14873624, + "balance_loss_mlp": 1.03231359, + "epoch": 0.5202465053359386, + "flos": 22859378383680.0, + "grad_norm": 2.3627272531087526, + "language_loss": 0.70388925, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.73086905, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 2.817100763320923 + }, + { + "auxiliary_loss_clip": 0.01458968, + "auxiliary_loss_mlp": 0.01246152, + "balance_loss_clip": 1.15048122, + "balance_loss_mlp": 1.03786922, + "epoch": 0.5203066285886067, + "flos": 21983722210080.0, + "grad_norm": 1.6487390084006666, + "language_loss": 0.78986812, + "learning_rate": 1.965534347297008e-06, + "loss": 0.81691927, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 2.782932758331299 + }, + { + "auxiliary_loss_clip": 0.01460151, + "auxiliary_loss_mlp": 0.01249658, + "balance_loss_clip": 1.15100455, + "balance_loss_mlp": 1.04118443, + "epoch": 0.5203667518412746, + "flos": 20235595828320.0, + "grad_norm": 1.903359859425611, + "language_loss": 0.84146529, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86856341, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 2.842869997024536 + }, + { + "auxiliary_loss_clip": 0.01456001, + "auxiliary_loss_mlp": 0.01241209, + "balance_loss_clip": 1.14720047, + "balance_loss_mlp": 1.03636014, + "epoch": 0.5204268750939426, + "flos": 15707269702080.0, + "grad_norm": 3.568337102729523, + "language_loss": 0.66460669, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.69157881, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 2.8203530311584473 + }, + { + "auxiliary_loss_clip": 0.01458053, + "auxiliary_loss_mlp": 0.012397, + "balance_loss_clip": 1.14923692, + "balance_loss_mlp": 1.03218079, + "epoch": 0.5204869983466105, + "flos": 27451690044480.0, + "grad_norm": 2.339984291757351, + "language_loss": 0.73641181, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.76338935, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.818192720413208 + }, + { + "auxiliary_loss_clip": 0.01458228, + "auxiliary_loss_mlp": 0.01237879, + "balance_loss_clip": 1.15049386, + "balance_loss_mlp": 1.02959633, + "epoch": 0.5205471215992785, + "flos": 20597585830560.0, + "grad_norm": 2.1705498861144883, + "language_loss": 0.71558511, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.7425462, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 2.8356683254241943 + }, + { + "auxiliary_loss_clip": 0.01457698, + "auxiliary_loss_mlp": 0.01232164, + "balance_loss_clip": 1.15008342, + "balance_loss_mlp": 1.0238812, + "epoch": 0.5206072448519465, + "flos": 22130126364960.0, + "grad_norm": 1.9215992052395803, + "language_loss": 0.8342483, + "learning_rate": 1.963587344701897e-06, + "loss": 0.86114693, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 4.228291273117065 + }, + { + "auxiliary_loss_clip": 0.01455059, + "auxiliary_loss_mlp": 0.01257286, + "balance_loss_clip": 1.14566219, + "balance_loss_mlp": 1.04881251, + "epoch": 0.5206673681046144, + "flos": 18332265958560.0, + "grad_norm": 2.1272988609225068, + "language_loss": 0.75610769, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.78323114, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.8307197093963623 + }, + { + "auxiliary_loss_clip": 0.0146052, + "auxiliary_loss_mlp": 0.01240532, + "balance_loss_clip": 1.15169597, + "balance_loss_mlp": 1.03301239, + "epoch": 0.5207274913572825, + "flos": 20232334006560.0, + "grad_norm": 2.039782636780055, + "language_loss": 0.77867651, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80568707, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 2.7512285709381104 + }, + { + "auxiliary_loss_clip": 0.01454126, + "auxiliary_loss_mlp": 0.01246432, + "balance_loss_clip": 1.14533556, + "balance_loss_mlp": 1.0410111, + "epoch": 0.5207876146099504, + "flos": 22129216089120.0, + "grad_norm": 1.78813674990663, + "language_loss": 0.69781941, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72482497, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 2.8657314777374268 + }, + { + "auxiliary_loss_clip": 0.01453894, + "auxiliary_loss_mlp": 0.01235715, + "balance_loss_clip": 1.14414573, + "balance_loss_mlp": 1.03086591, + "epoch": 0.5208477378626184, + "flos": 23881211143200.0, + "grad_norm": 1.5836442562173982, + "language_loss": 0.69198382, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71887994, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 2.8142642974853516 + }, + { + "auxiliary_loss_clip": 0.01457415, + "auxiliary_loss_mlp": 0.01243927, + "balance_loss_clip": 1.1485877, + "balance_loss_mlp": 1.03545427, + "epoch": 0.5209078611152863, + "flos": 20963747930400.0, + "grad_norm": 1.624493140867602, + "language_loss": 0.76639152, + "learning_rate": 1.961640376626072e-06, + "loss": 0.793405, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.7455668449401855 + }, + { + "auxiliary_loss_clip": 0.0145483, + "auxiliary_loss_mlp": 0.01238284, + "balance_loss_clip": 1.14512861, + "balance_loss_mlp": 1.03019261, + "epoch": 0.5209679843679543, + "flos": 20669839704000.0, + "grad_norm": 2.054134853681988, + "language_loss": 0.76488781, + "learning_rate": 1.961250987315646e-06, + "loss": 0.79181898, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.8371214866638184 + }, + { + "auxiliary_loss_clip": 0.01458107, + "auxiliary_loss_mlp": 0.01244418, + "balance_loss_clip": 1.14846945, + "balance_loss_mlp": 1.03747058, + "epoch": 0.5210281076206222, + "flos": 20229223897440.0, + "grad_norm": 2.10559154349605, + "language_loss": 0.72377127, + "learning_rate": 1.960861599474586e-06, + "loss": 0.75079656, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 2.745765209197998 + }, + { + "auxiliary_loss_clip": 0.01457979, + "auxiliary_loss_mlp": 0.01251367, + "balance_loss_clip": 1.14854038, + "balance_loss_mlp": 1.03965151, + "epoch": 0.5210882308732903, + "flos": 16071800891040.0, + "grad_norm": 2.0887901119411216, + "language_loss": 0.6816811, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70877457, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.8113341331481934 + }, + { + "auxiliary_loss_clip": 0.01459447, + "auxiliary_loss_mlp": 0.01229863, + "balance_loss_clip": 1.15054548, + "balance_loss_mlp": 1.02444196, + "epoch": 0.5211483541259582, + "flos": 24827528207520.0, + "grad_norm": 1.4579938187962544, + "language_loss": 0.81196117, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83885425, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 4.2667200565338135 + }, + { + "auxiliary_loss_clip": 0.01455469, + "auxiliary_loss_mlp": 0.01243045, + "balance_loss_clip": 1.1472528, + "balance_loss_mlp": 1.0378139, + "epoch": 0.5212084773786262, + "flos": 20372442086880.0, + "grad_norm": 2.553400764156476, + "language_loss": 0.63902187, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.66600704, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 4.387009620666504 + }, + { + "auxiliary_loss_clip": 0.01454628, + "auxiliary_loss_mlp": 0.01244655, + "balance_loss_clip": 1.14590847, + "balance_loss_mlp": 1.03885221, + "epoch": 0.5212686006312941, + "flos": 23147521529760.0, + "grad_norm": 1.548688365878892, + "language_loss": 0.66451252, + "learning_rate": 1.959304063099325e-06, + "loss": 0.69150531, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.8143837451934814 + }, + { + "auxiliary_loss_clip": 0.01455315, + "auxiliary_loss_mlp": 0.01243639, + "balance_loss_clip": 1.14674926, + "balance_loss_mlp": 1.03764522, + "epoch": 0.5213287238839621, + "flos": 27776054947680.0, + "grad_norm": 2.256861751416579, + "language_loss": 0.75780702, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78479648, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 2.81986665725708 + }, + { + "auxiliary_loss_clip": 0.01459066, + "auxiliary_loss_mlp": 0.01244002, + "balance_loss_clip": 1.14987528, + "balance_loss_mlp": 1.03591084, + "epoch": 0.5213888471366301, + "flos": 19939715337600.0, + "grad_norm": 2.030883689656213, + "language_loss": 0.7845304, + "learning_rate": 1.958525304111796e-06, + "loss": 0.81156111, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.745258331298828 + }, + { + "auxiliary_loss_clip": 0.01454414, + "auxiliary_loss_mlp": 0.01238069, + "balance_loss_clip": 1.14517474, + "balance_loss_mlp": 1.03398287, + "epoch": 0.521448970389298, + "flos": 16984627025760.0, + "grad_norm": 1.7892071379276167, + "language_loss": 0.72370636, + "learning_rate": 1.958135926969736e-06, + "loss": 0.75063121, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 4.183932781219482 + }, + { + "auxiliary_loss_clip": 0.01454224, + "auxiliary_loss_mlp": 0.01241454, + "balance_loss_clip": 1.14502072, + "balance_loss_mlp": 1.03679585, + "epoch": 0.5215090936419661, + "flos": 18991729434240.0, + "grad_norm": 1.6885596172662514, + "language_loss": 0.7468394, + "learning_rate": 1.957746551415166e-06, + "loss": 0.7737962, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.782524824142456 + }, + { + "auxiliary_loss_clip": 0.01456804, + "auxiliary_loss_mlp": 0.01241365, + "balance_loss_clip": 1.14797246, + "balance_loss_mlp": 1.03479922, + "epoch": 0.521569216894634, + "flos": 16145268465600.0, + "grad_norm": 2.7651970083178954, + "language_loss": 0.86171794, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88869965, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.759716272354126 + }, + { + "auxiliary_loss_clip": 0.01558648, + "auxiliary_loss_mlp": 0.01216606, + "balance_loss_clip": 1.27129853, + "balance_loss_mlp": 1.0205307, + "epoch": 0.521629340147302, + "flos": 57585041380800.0, + "grad_norm": 0.867290242597589, + "language_loss": 0.6302768, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65802932, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 3.2919886112213135 + }, + { + "auxiliary_loss_clip": 0.01457603, + "auxiliary_loss_mlp": 0.01233859, + "balance_loss_clip": 1.14884949, + "balance_loss_mlp": 1.02862823, + "epoch": 0.5216894633999699, + "flos": 26799319134720.0, + "grad_norm": 1.8160760705605694, + "language_loss": 0.68819493, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71510947, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 2.7930123805999756 + }, + { + "auxiliary_loss_clip": 0.01459163, + "auxiliary_loss_mlp": 0.01242852, + "balance_loss_clip": 1.14925647, + "balance_loss_mlp": 1.03609562, + "epoch": 0.5217495866526379, + "flos": 26361092802240.0, + "grad_norm": 1.6648967403520079, + "language_loss": 0.65322411, + "learning_rate": 1.956189065367086e-06, + "loss": 0.68024433, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 2.8890559673309326 + }, + { + "auxiliary_loss_clip": 0.014543, + "auxiliary_loss_mlp": 0.01239615, + "balance_loss_clip": 1.14505577, + "balance_loss_mlp": 1.03285813, + "epoch": 0.5218097099053058, + "flos": 23586127143840.0, + "grad_norm": 2.3149194621201974, + "language_loss": 0.68377721, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71071631, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 2.745863437652588 + }, + { + "auxiliary_loss_clip": 0.01450273, + "auxiliary_loss_mlp": 0.01232961, + "balance_loss_clip": 1.14220083, + "balance_loss_mlp": 1.02849352, + "epoch": 0.5218698331579739, + "flos": 18079358437440.0, + "grad_norm": 2.7017853223373107, + "language_loss": 0.66808391, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69491619, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.8143980503082275 + }, + { + "auxiliary_loss_clip": 0.01461605, + "auxiliary_loss_mlp": 0.01245227, + "balance_loss_clip": 1.15415478, + "balance_loss_mlp": 1.03961444, + "epoch": 0.5219299564106418, + "flos": 19283134402080.0, + "grad_norm": 3.4563819307789205, + "language_loss": 0.83651447, + "learning_rate": 1.955020968223156e-06, + "loss": 0.86358273, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 2.796679973602295 + }, + { + "auxiliary_loss_clip": 0.01457184, + "auxiliary_loss_mlp": 0.01232179, + "balance_loss_clip": 1.14891815, + "balance_loss_mlp": 1.02599454, + "epoch": 0.5219900796633098, + "flos": 26653787327520.0, + "grad_norm": 1.792692864145104, + "language_loss": 0.77828163, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.8051753, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.814208745956421 + }, + { + "auxiliary_loss_clip": 0.01462825, + "auxiliary_loss_mlp": 0.01243457, + "balance_loss_clip": 1.15448499, + "balance_loss_mlp": 1.03593791, + "epoch": 0.5220502029159777, + "flos": 34315541795520.0, + "grad_norm": 1.6500316638524661, + "language_loss": 0.6924603, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71952313, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.9451444149017334 + }, + { + "auxiliary_loss_clip": 0.01464771, + "auxiliary_loss_mlp": 0.01229521, + "balance_loss_clip": 1.15635157, + "balance_loss_mlp": 1.02371848, + "epoch": 0.5221103261686457, + "flos": 22158269424000.0, + "grad_norm": 1.9691965032471508, + "language_loss": 0.76391065, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.79085356, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.7783267498016357 + }, + { + "auxiliary_loss_clip": 0.01462078, + "auxiliary_loss_mlp": 0.01233677, + "balance_loss_clip": 1.15330625, + "balance_loss_mlp": 1.02539444, + "epoch": 0.5221704494213137, + "flos": 19210425390720.0, + "grad_norm": 1.704208464463368, + "language_loss": 0.755485, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.78244257, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 2.8844261169433594 + }, + { + "auxiliary_loss_clip": 0.01458566, + "auxiliary_loss_mlp": 0.01241893, + "balance_loss_clip": 1.15051508, + "balance_loss_mlp": 1.03608966, + "epoch": 0.5222305726739817, + "flos": 19356070982400.0, + "grad_norm": 1.7643411455800297, + "language_loss": 0.81096387, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83796841, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 2.732309103012085 + }, + { + "auxiliary_loss_clip": 0.0145985, + "auxiliary_loss_mlp": 0.01231607, + "balance_loss_clip": 1.14998603, + "balance_loss_mlp": 1.0258038, + "epoch": 0.5222906959266497, + "flos": 27817055652960.0, + "grad_norm": 1.8509464582355555, + "language_loss": 0.69884348, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72575796, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.760676622390747 + }, + { + "auxiliary_loss_clip": 0.01453629, + "auxiliary_loss_mlp": 0.01232986, + "balance_loss_clip": 1.14432764, + "balance_loss_mlp": 1.02718306, + "epoch": 0.5223508191793176, + "flos": 12714594219360.0, + "grad_norm": 2.6757093778030807, + "language_loss": 0.83079278, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85765898, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 2.73580002784729 + }, + { + "auxiliary_loss_clip": 0.01459839, + "auxiliary_loss_mlp": 0.01235393, + "balance_loss_clip": 1.14872205, + "balance_loss_mlp": 1.02768254, + "epoch": 0.5224109424319856, + "flos": 15634029696480.0, + "grad_norm": 4.170381962762795, + "language_loss": 0.7344625, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.76141483, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 2.8508005142211914 + }, + { + "auxiliary_loss_clip": 0.01459533, + "auxiliary_loss_mlp": 0.01238946, + "balance_loss_clip": 1.14958322, + "balance_loss_mlp": 1.03409648, + "epoch": 0.5224710656846535, + "flos": 15744060381600.0, + "grad_norm": 2.0932069653435734, + "language_loss": 0.82949382, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85647857, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 2.7638437747955322 + }, + { + "auxiliary_loss_clip": 0.0145665, + "auxiliary_loss_mlp": 0.01230248, + "balance_loss_clip": 1.14704764, + "balance_loss_mlp": 1.01986778, + "epoch": 0.5225311889373215, + "flos": 26033390220960.0, + "grad_norm": 2.6583630730631342, + "language_loss": 0.78773725, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81460625, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.7873876094818115 + }, + { + "auxiliary_loss_clip": 0.01457019, + "auxiliary_loss_mlp": 0.01232735, + "balance_loss_clip": 1.14665151, + "balance_loss_mlp": 1.02483368, + "epoch": 0.5225913121899894, + "flos": 18371256471360.0, + "grad_norm": 2.3506158931573187, + "language_loss": 0.77072203, + "learning_rate": 1.950738079725646e-06, + "loss": 0.79761952, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 2.7610223293304443 + }, + { + "auxiliary_loss_clip": 0.01463615, + "auxiliary_loss_mlp": 0.01236423, + "balance_loss_clip": 1.15443718, + "balance_loss_mlp": 1.03023839, + "epoch": 0.5226514354426575, + "flos": 29275825187520.0, + "grad_norm": 2.1143162913383495, + "language_loss": 0.72961712, + "learning_rate": 1.950348737138691e-06, + "loss": 0.75661749, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 2.8330578804016113 + }, + { + "auxiliary_loss_clip": 0.0145107, + "auxiliary_loss_mlp": 0.01235246, + "balance_loss_clip": 1.13947535, + "balance_loss_mlp": 1.02600932, + "epoch": 0.5227115586953254, + "flos": 22855320070560.0, + "grad_norm": 2.3544094720045403, + "language_loss": 0.8214103, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84827352, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.8183491230010986 + }, + { + "auxiliary_loss_clip": 0.01556808, + "auxiliary_loss_mlp": 0.0119146, + "balance_loss_clip": 1.26703942, + "balance_loss_mlp": 0.9930954, + "epoch": 0.5227716819479934, + "flos": 57480206853600.0, + "grad_norm": 0.7704235682979746, + "language_loss": 0.55669022, + "learning_rate": 1.949570057627888e-06, + "loss": 0.58417284, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 3.4056143760681152 + }, + { + "auxiliary_loss_clip": 0.01458195, + "auxiliary_loss_mlp": 0.01229291, + "balance_loss_clip": 1.14646697, + "balance_loss_mlp": 1.02138984, + "epoch": 0.5228318052006613, + "flos": 13809970409760.0, + "grad_norm": 1.832219675030357, + "language_loss": 0.73137444, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75824928, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 2.833500623703003 + }, + { + "auxiliary_loss_clip": 0.01451502, + "auxiliary_loss_mlp": 0.01231512, + "balance_loss_clip": 1.14106035, + "balance_loss_mlp": 1.02570915, + "epoch": 0.5228919284533293, + "flos": 15597997580160.0, + "grad_norm": 1.9397665221412363, + "language_loss": 0.71414536, + "learning_rate": 1.948791385766319e-06, + "loss": 0.7409755, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 2.8452816009521484 + }, + { + "auxiliary_loss_clip": 0.01447908, + "auxiliary_loss_mlp": 0.01230521, + "balance_loss_clip": 1.13725126, + "balance_loss_mlp": 1.0250994, + "epoch": 0.5229520517059973, + "flos": 22493671421760.0, + "grad_norm": 2.149321725574245, + "language_loss": 0.80681539, + "learning_rate": 1.948402052740906e-06, + "loss": 0.83359969, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 4.154904842376709 + }, + { + "auxiliary_loss_clip": 0.01454506, + "auxiliary_loss_mlp": 0.01237128, + "balance_loss_clip": 1.14461899, + "balance_loss_mlp": 1.03227878, + "epoch": 0.5230121749586653, + "flos": 22093070188320.0, + "grad_norm": 1.662029259750308, + "language_loss": 0.74213886, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76905519, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.8225584030151367 + }, + { + "auxiliary_loss_clip": 0.01449854, + "auxiliary_loss_mlp": 0.01231267, + "balance_loss_clip": 1.13890696, + "balance_loss_mlp": 1.023175, + "epoch": 0.5230722982113333, + "flos": 22129329873600.0, + "grad_norm": 1.715999843971457, + "language_loss": 0.72977269, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75658387, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 2.8238277435302734 + }, + { + "auxiliary_loss_clip": 0.01451601, + "auxiliary_loss_mlp": 0.01233821, + "balance_loss_clip": 1.13982236, + "balance_loss_mlp": 1.02725482, + "epoch": 0.5231324214640012, + "flos": 25011595389600.0, + "grad_norm": 2.271801561447542, + "language_loss": 0.66846901, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69532323, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 2.7674670219421387 + }, + { + "auxiliary_loss_clip": 0.01453959, + "auxiliary_loss_mlp": 0.0124608, + "balance_loss_clip": 1.14472866, + "balance_loss_mlp": 1.04351926, + "epoch": 0.5231925447166692, + "flos": 25742971385280.0, + "grad_norm": 2.2945722307422707, + "language_loss": 0.6686675, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.69566786, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.856567144393921 + }, + { + "auxiliary_loss_clip": 0.01454502, + "auxiliary_loss_mlp": 0.01234804, + "balance_loss_clip": 1.14374793, + "balance_loss_mlp": 1.02900136, + "epoch": 0.5232526679693371, + "flos": 21436072043040.0, + "grad_norm": 1.8995352071823335, + "language_loss": 0.76663703, + "learning_rate": 1.946455417258101e-06, + "loss": 0.79353005, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.737877130508423 + }, + { + "auxiliary_loss_clip": 0.01452707, + "auxiliary_loss_mlp": 0.01245529, + "balance_loss_clip": 1.14338851, + "balance_loss_mlp": 1.0381999, + "epoch": 0.5233127912220051, + "flos": 35301304510560.0, + "grad_norm": 2.2891624512164346, + "language_loss": 0.77139711, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79837942, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 2.8505280017852783 + }, + { + "auxiliary_loss_clip": 0.01461715, + "auxiliary_loss_mlp": 0.0124901, + "balance_loss_clip": 1.15469229, + "balance_loss_mlp": 1.04358828, + "epoch": 0.523372914474673, + "flos": 17052974298720.0, + "grad_norm": 3.040508470869084, + "language_loss": 0.78256392, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80967116, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 4.195703029632568 + }, + { + "auxiliary_loss_clip": 0.01451924, + "auxiliary_loss_mlp": 0.01235097, + "balance_loss_clip": 1.1421653, + "balance_loss_mlp": 1.02719593, + "epoch": 0.5234330377273411, + "flos": 18408312648000.0, + "grad_norm": 2.4568397287747077, + "language_loss": 0.69786525, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.72473544, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 4.245158910751343 + }, + { + "auxiliary_loss_clip": 0.01574311, + "auxiliary_loss_mlp": 0.01234978, + "balance_loss_clip": 1.28852606, + "balance_loss_mlp": 1.0411911, + "epoch": 0.523493160980009, + "flos": 65857710551040.0, + "grad_norm": 0.6831005441862444, + "language_loss": 0.52405155, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.55214441, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.3915164470672607 + }, + { + "auxiliary_loss_clip": 0.01454586, + "auxiliary_loss_mlp": 0.01245447, + "balance_loss_clip": 1.14481473, + "balance_loss_mlp": 1.04040682, + "epoch": 0.523553284232677, + "flos": 21874260447360.0, + "grad_norm": 1.9697122872589485, + "language_loss": 0.74321073, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.77021104, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.8575894832611084 + }, + { + "auxiliary_loss_clip": 0.01458513, + "auxiliary_loss_mlp": 0.01240534, + "balance_loss_clip": 1.14856219, + "balance_loss_mlp": 1.03530347, + "epoch": 0.5236134074853449, + "flos": 20850189926400.0, + "grad_norm": 1.6057151104256429, + "language_loss": 0.77228606, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79927647, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.8164892196655273 + }, + { + "auxiliary_loss_clip": 0.01453406, + "auxiliary_loss_mlp": 0.01248115, + "balance_loss_clip": 1.14304864, + "balance_loss_mlp": 1.04193103, + "epoch": 0.5236735307380129, + "flos": 25522796230560.0, + "grad_norm": 2.127984224854258, + "language_loss": 0.83971316, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.86672837, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 2.796696901321411 + }, + { + "auxiliary_loss_clip": 0.01452318, + "auxiliary_loss_mlp": 0.01243444, + "balance_loss_clip": 1.14368093, + "balance_loss_mlp": 1.03878498, + "epoch": 0.523733653990681, + "flos": 23585178939840.0, + "grad_norm": 2.3104463377412228, + "language_loss": 0.69615674, + "learning_rate": 1.943340906834908e-06, + "loss": 0.72311437, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 4.307425260543823 + }, + { + "auxiliary_loss_clip": 0.01456429, + "auxiliary_loss_mlp": 0.01245721, + "balance_loss_clip": 1.14773393, + "balance_loss_mlp": 1.03915501, + "epoch": 0.5237937772433489, + "flos": 21108407389920.0, + "grad_norm": 1.790577543765073, + "language_loss": 0.82888258, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.8559041, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.8366317749023438 + }, + { + "auxiliary_loss_clip": 0.01455935, + "auxiliary_loss_mlp": 0.01237096, + "balance_loss_clip": 1.14538503, + "balance_loss_mlp": 1.03033972, + "epoch": 0.5238539004960169, + "flos": 19174962196800.0, + "grad_norm": 1.7950907338703395, + "language_loss": 0.69648123, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.72341144, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.8279573917388916 + }, + { + "auxiliary_loss_clip": 0.01455524, + "auxiliary_loss_mlp": 0.01242371, + "balance_loss_clip": 1.14540553, + "balance_loss_mlp": 1.03389788, + "epoch": 0.5239140237486848, + "flos": 17889791672160.0, + "grad_norm": 3.457777795587658, + "language_loss": 0.76606083, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.7930398, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.7431180477142334 + }, + { + "auxiliary_loss_clip": 0.01453173, + "auxiliary_loss_mlp": 0.01234759, + "balance_loss_clip": 1.14375412, + "balance_loss_mlp": 1.02609491, + "epoch": 0.5239741470013528, + "flos": 17932309503840.0, + "grad_norm": 2.108654142406245, + "language_loss": 0.76191866, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78879797, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 2.728680372238159 + }, + { + "auxiliary_loss_clip": 0.01457518, + "auxiliary_loss_mlp": 0.0124914, + "balance_loss_clip": 1.14870536, + "balance_loss_mlp": 1.04524446, + "epoch": 0.5240342702540207, + "flos": 30996074007360.0, + "grad_norm": 1.746156579894723, + "language_loss": 0.71276504, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73983163, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.8708670139312744 + }, + { + "auxiliary_loss_clip": 0.01452409, + "auxiliary_loss_mlp": 0.01237178, + "balance_loss_clip": 1.14263177, + "balance_loss_mlp": 1.03290105, + "epoch": 0.5240943935066887, + "flos": 25007157794880.0, + "grad_norm": 1.9550093836376896, + "language_loss": 0.86937678, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89627266, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 2.850480556488037 + }, + { + "auxiliary_loss_clip": 0.01452674, + "auxiliary_loss_mlp": 0.01239801, + "balance_loss_clip": 1.14314139, + "balance_loss_mlp": 1.03456998, + "epoch": 0.5241545167593566, + "flos": 23661339413760.0, + "grad_norm": 1.8342142111129327, + "language_loss": 0.61356318, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.64048797, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 2.8440051078796387 + }, + { + "auxiliary_loss_clip": 0.01457455, + "auxiliary_loss_mlp": 0.01247918, + "balance_loss_clip": 1.14764333, + "balance_loss_mlp": 1.03830075, + "epoch": 0.5242146400120247, + "flos": 23402439243360.0, + "grad_norm": 1.7936109739476107, + "language_loss": 0.721174, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74822772, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.767428159713745 + }, + { + "auxiliary_loss_clip": 0.01454634, + "auxiliary_loss_mlp": 0.01233558, + "balance_loss_clip": 1.14597881, + "balance_loss_mlp": 1.02737331, + "epoch": 0.5242747632646926, + "flos": 17751011077440.0, + "grad_norm": 2.014631841323421, + "language_loss": 0.73095918, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75784111, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 2.888852119445801 + }, + { + "auxiliary_loss_clip": 0.01459614, + "auxiliary_loss_mlp": 0.01235383, + "balance_loss_clip": 1.15035725, + "balance_loss_mlp": 1.02843559, + "epoch": 0.5243348865173606, + "flos": 32600451205440.0, + "grad_norm": 2.0624840998085983, + "language_loss": 0.70096093, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72791088, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 2.857417106628418 + }, + { + "auxiliary_loss_clip": 0.01458363, + "auxiliary_loss_mlp": 0.01235133, + "balance_loss_clip": 1.14879489, + "balance_loss_mlp": 1.02971196, + "epoch": 0.5243950097700285, + "flos": 25486498617120.0, + "grad_norm": 1.894924696167617, + "language_loss": 0.86503571, + "learning_rate": 1.939058681065813e-06, + "loss": 0.89197069, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.8986215591430664 + }, + { + "auxiliary_loss_clip": 0.0145419, + "auxiliary_loss_mlp": 0.01237132, + "balance_loss_clip": 1.14472556, + "balance_loss_mlp": 1.03285503, + "epoch": 0.5244551330226965, + "flos": 15270219142560.0, + "grad_norm": 1.675468592886273, + "language_loss": 0.80043429, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82734752, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.8503870964050293 + }, + { + "auxiliary_loss_clip": 0.01467534, + "auxiliary_loss_mlp": 0.01243142, + "balance_loss_clip": 1.15802217, + "balance_loss_mlp": 1.03428769, + "epoch": 0.5245152562753645, + "flos": 22239322630560.0, + "grad_norm": 1.9921292829787132, + "language_loss": 0.7539345, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.7810412, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 2.7766425609588623 + }, + { + "auxiliary_loss_clip": 0.01457719, + "auxiliary_loss_mlp": 0.01243121, + "balance_loss_clip": 1.14712894, + "balance_loss_mlp": 1.0346477, + "epoch": 0.5245753795280325, + "flos": 29429170195680.0, + "grad_norm": 1.6115444144341946, + "language_loss": 0.70223665, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72924501, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.9200878143310547 + }, + { + "auxiliary_loss_clip": 0.01554385, + "auxiliary_loss_mlp": 0.01199791, + "balance_loss_clip": 1.26676774, + "balance_loss_mlp": 1.00066376, + "epoch": 0.5246355027807005, + "flos": 58840817217120.0, + "grad_norm": 0.7568233772565741, + "language_loss": 0.55602276, + "learning_rate": 1.937501576352568e-06, + "loss": 0.58356452, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 3.2887227535247803 + }, + { + "auxiliary_loss_clip": 0.01550415, + "auxiliary_loss_mlp": 0.01195511, + "balance_loss_clip": 1.26261723, + "balance_loss_mlp": 0.99638367, + "epoch": 0.5246956260333684, + "flos": 64533359872800.0, + "grad_norm": 0.7954075356413924, + "language_loss": 0.58341515, + "learning_rate": 1.937112306062219e-06, + "loss": 0.61087441, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.259145736694336 + }, + { + "auxiliary_loss_clip": 0.01458232, + "auxiliary_loss_mlp": 0.01247779, + "balance_loss_clip": 1.14812148, + "balance_loss_mlp": 1.04216659, + "epoch": 0.5247557492860364, + "flos": 24535857742560.0, + "grad_norm": 1.2677211097798642, + "language_loss": 0.70673561, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.73379576, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 2.8367018699645996 + }, + { + "auxiliary_loss_clip": 0.01462084, + "auxiliary_loss_mlp": 0.01233396, + "balance_loss_clip": 1.15226746, + "balance_loss_mlp": 1.02759361, + "epoch": 0.5248158725387043, + "flos": 18808079461920.0, + "grad_norm": 1.6447108572736722, + "language_loss": 0.69617212, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.72312689, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 2.80021333694458 + }, + { + "auxiliary_loss_clip": 0.014651, + "auxiliary_loss_mlp": 0.01249791, + "balance_loss_clip": 1.15538597, + "balance_loss_mlp": 1.04093599, + "epoch": 0.5248759957913723, + "flos": 20957755281120.0, + "grad_norm": 1.6915939964026092, + "language_loss": 0.83272249, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85987139, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.7960946559906006 + }, + { + "auxiliary_loss_clip": 0.01460719, + "auxiliary_loss_mlp": 0.01242411, + "balance_loss_clip": 1.15193617, + "balance_loss_mlp": 1.03698921, + "epoch": 0.5249361190440403, + "flos": 18662964864480.0, + "grad_norm": 2.446186681648468, + "language_loss": 0.79428661, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.82131791, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 2.804140567779541 + }, + { + "auxiliary_loss_clip": 0.01458735, + "auxiliary_loss_mlp": 0.01247355, + "balance_loss_clip": 1.14866996, + "balance_loss_mlp": 1.04403162, + "epoch": 0.5249962422967083, + "flos": 24865646372640.0, + "grad_norm": 1.731118232699503, + "language_loss": 0.83201408, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85907501, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.8607842922210693 + }, + { + "auxiliary_loss_clip": 0.01460142, + "auxiliary_loss_mlp": 0.01232513, + "balance_loss_clip": 1.14980435, + "balance_loss_mlp": 1.02670979, + "epoch": 0.5250563655493762, + "flos": 15264188565120.0, + "grad_norm": 1.6154221539111189, + "language_loss": 0.77506459, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.80199111, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.7129340171813965 + }, + { + "auxiliary_loss_clip": 0.01457238, + "auxiliary_loss_mlp": 0.01244571, + "balance_loss_clip": 1.14411664, + "balance_loss_mlp": 1.03914988, + "epoch": 0.5251164888020442, + "flos": 18627198245280.0, + "grad_norm": 2.2936682519727927, + "language_loss": 0.81900859, + "learning_rate": 1.934387481628208e-06, + "loss": 0.84602672, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 2.812541961669922 + }, + { + "auxiliary_loss_clip": 0.01451116, + "auxiliary_loss_mlp": 0.01233693, + "balance_loss_clip": 1.14022231, + "balance_loss_mlp": 1.02674556, + "epoch": 0.5251766120547121, + "flos": 29713065387840.0, + "grad_norm": 1.4237130024019322, + "language_loss": 0.76808393, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79493201, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 2.873030424118042 + }, + { + "auxiliary_loss_clip": 0.01451415, + "auxiliary_loss_mlp": 0.01230781, + "balance_loss_clip": 1.14094305, + "balance_loss_mlp": 1.02669501, + "epoch": 0.5252367353073801, + "flos": 23442529672800.0, + "grad_norm": 1.5562492987895913, + "language_loss": 0.8022573, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82907927, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 4.211497783660889 + }, + { + "auxiliary_loss_clip": 0.01450983, + "auxiliary_loss_mlp": 0.01251038, + "balance_loss_clip": 1.14008904, + "balance_loss_mlp": 1.04389977, + "epoch": 0.5252968585600482, + "flos": 30813372239040.0, + "grad_norm": 2.017147914813406, + "language_loss": 0.70104063, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72806084, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 2.906384229660034 + }, + { + "auxiliary_loss_clip": 0.01446792, + "auxiliary_loss_mlp": 0.01236418, + "balance_loss_clip": 1.13677001, + "balance_loss_mlp": 1.03080559, + "epoch": 0.5253569818127161, + "flos": 20630204412480.0, + "grad_norm": 1.465492307632924, + "language_loss": 0.77166361, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79849565, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 2.824751615524292 + }, + { + "auxiliary_loss_clip": 0.01523716, + "auxiliary_loss_mlp": 0.01234428, + "balance_loss_clip": 1.23303282, + "balance_loss_mlp": 1.03759003, + "epoch": 0.5254171050653841, + "flos": 63435214926720.0, + "grad_norm": 0.7532261579821501, + "language_loss": 0.54433954, + "learning_rate": 1.932441252806837e-06, + "loss": 0.57192099, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.352654457092285 + }, + { + "auxiliary_loss_clip": 0.014488, + "auxiliary_loss_mlp": 0.01237702, + "balance_loss_clip": 1.13757253, + "balance_loss_mlp": 1.03304374, + "epoch": 0.525477228318052, + "flos": 34673245915680.0, + "grad_norm": 1.757120497093526, + "language_loss": 0.84687632, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.87374127, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.9451334476470947 + }, + { + "auxiliary_loss_clip": 0.01453021, + "auxiliary_loss_mlp": 0.01245326, + "balance_loss_clip": 1.14103127, + "balance_loss_mlp": 1.04085803, + "epoch": 0.52553735157072, + "flos": 17932764641760.0, + "grad_norm": 2.037949035668273, + "language_loss": 0.69722402, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.72420752, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.8674492835998535 + }, + { + "auxiliary_loss_clip": 0.01449323, + "auxiliary_loss_mlp": 0.01247606, + "balance_loss_clip": 1.13711989, + "balance_loss_mlp": 1.0410403, + "epoch": 0.5255974748233879, + "flos": 9942359388480.0, + "grad_norm": 2.1193988449549828, + "language_loss": 0.65907013, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68603939, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 2.783073663711548 + }, + { + "auxiliary_loss_clip": 0.01454569, + "auxiliary_loss_mlp": 0.01244166, + "balance_loss_clip": 1.14353108, + "balance_loss_mlp": 1.03531134, + "epoch": 0.5256575980760559, + "flos": 16870044961440.0, + "grad_norm": 2.197659315156192, + "language_loss": 0.6345973, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.66158462, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.8248836994171143 + }, + { + "auxiliary_loss_clip": 0.01521169, + "auxiliary_loss_mlp": 0.0121875, + "balance_loss_clip": 1.22750759, + "balance_loss_mlp": 1.02191162, + "epoch": 0.5257177213287239, + "flos": 62393545375200.0, + "grad_norm": 0.772091489980549, + "language_loss": 0.54093206, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56833124, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 6.255696058273315 + }, + { + "auxiliary_loss_clip": 0.01448562, + "auxiliary_loss_mlp": 0.01240643, + "balance_loss_clip": 1.13765049, + "balance_loss_mlp": 1.03045297, + "epoch": 0.5257778445813919, + "flos": 20778998041440.0, + "grad_norm": 2.319710276353283, + "language_loss": 0.75638592, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.78327799, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.8115580081939697 + }, + { + "auxiliary_loss_clip": 0.01451772, + "auxiliary_loss_mlp": 0.01241606, + "balance_loss_clip": 1.14048982, + "balance_loss_mlp": 1.03580344, + "epoch": 0.5258379678340598, + "flos": 17020469501280.0, + "grad_norm": 2.0603839919244438, + "language_loss": 0.81220812, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83914185, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.821065902709961 + }, + { + "auxiliary_loss_clip": 0.01451068, + "auxiliary_loss_mlp": 0.01232044, + "balance_loss_clip": 1.13930249, + "balance_loss_mlp": 1.02547836, + "epoch": 0.5258980910867278, + "flos": 21070858147200.0, + "grad_norm": 1.8576634475854605, + "language_loss": 0.75131178, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77814293, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": 2.7707159519195557 + }, + { + "auxiliary_loss_clip": 0.01445191, + "auxiliary_loss_mlp": 0.01233921, + "balance_loss_clip": 1.13472939, + "balance_loss_mlp": 1.02735567, + "epoch": 0.5259582143393957, + "flos": 18006421857120.0, + "grad_norm": 1.8099705787388347, + "language_loss": 0.82792705, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.85471815, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.8134982585906982 + }, + { + "auxiliary_loss_clip": 0.01449681, + "auxiliary_loss_mlp": 0.01241565, + "balance_loss_clip": 1.1398046, + "balance_loss_mlp": 1.03309202, + "epoch": 0.5260183375920637, + "flos": 22786062521760.0, + "grad_norm": 1.9088759583229482, + "language_loss": 0.80573678, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.83264929, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 2.8145787715911865 + }, + { + "auxiliary_loss_clip": 0.01454097, + "auxiliary_loss_mlp": 0.01249806, + "balance_loss_clip": 1.14395928, + "balance_loss_mlp": 1.04285908, + "epoch": 0.5260784608447318, + "flos": 27054843698880.0, + "grad_norm": 2.8633926372998837, + "language_loss": 0.72556293, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.75260198, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 4.3465704917907715 + }, + { + "auxiliary_loss_clip": 0.01442403, + "auxiliary_loss_mlp": 0.01231529, + "balance_loss_clip": 1.1334399, + "balance_loss_mlp": 1.02496338, + "epoch": 0.5261385840973997, + "flos": 20664643546080.0, + "grad_norm": 1.4266887287635117, + "language_loss": 0.76598263, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.79272199, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.832658529281616 + }, + { + "auxiliary_loss_clip": 0.01442797, + "auxiliary_loss_mlp": 0.01233907, + "balance_loss_clip": 1.13271558, + "balance_loss_mlp": 1.02772307, + "epoch": 0.5261987073500677, + "flos": 23624890087680.0, + "grad_norm": 1.9318852070344545, + "language_loss": 0.76116931, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78793633, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.846517324447632 + }, + { + "auxiliary_loss_clip": 0.01445663, + "auxiliary_loss_mlp": 0.01238304, + "balance_loss_clip": 1.13587904, + "balance_loss_mlp": 1.03059351, + "epoch": 0.5262588306027356, + "flos": 27638715623040.0, + "grad_norm": 1.9107811511741535, + "language_loss": 0.68077707, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70761681, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.7788443565368652 + }, + { + "auxiliary_loss_clip": 0.01444131, + "auxiliary_loss_mlp": 0.01237893, + "balance_loss_clip": 1.13366199, + "balance_loss_mlp": 1.03056455, + "epoch": 0.5263189538554036, + "flos": 21761726503680.0, + "grad_norm": 1.7345910529943347, + "language_loss": 0.83889937, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.86571962, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.7894911766052246 + }, + { + "auxiliary_loss_clip": 0.01439632, + "auxiliary_loss_mlp": 0.0123192, + "balance_loss_clip": 1.12931919, + "balance_loss_mlp": 1.02745199, + "epoch": 0.5263790771080715, + "flos": 14277743143200.0, + "grad_norm": 2.4799374994785874, + "language_loss": 0.87214428, + "learning_rate": 1.926213760058522e-06, + "loss": 0.8988598, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.746511459350586 + }, + { + "auxiliary_loss_clip": 0.0148929, + "auxiliary_loss_mlp": 0.01197983, + "balance_loss_clip": 1.19814348, + "balance_loss_mlp": 1.00114441, + "epoch": 0.5264392003607395, + "flos": 65813410095840.0, + "grad_norm": 0.7463468358968298, + "language_loss": 0.58786714, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.61473989, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 3.4392282962799072 + }, + { + "auxiliary_loss_clip": 0.01438081, + "auxiliary_loss_mlp": 0.01232529, + "balance_loss_clip": 1.12842345, + "balance_loss_mlp": 1.02596283, + "epoch": 0.5264993236134075, + "flos": 21034522605600.0, + "grad_norm": 1.6732321089552384, + "language_loss": 0.70764637, + "learning_rate": 1.925435372588913e-06, + "loss": 0.73435247, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 2.831507921218872 + }, + { + "auxiliary_loss_clip": 0.01443811, + "auxiliary_loss_mlp": 0.01241305, + "balance_loss_clip": 1.13316917, + "balance_loss_mlp": 1.03550172, + "epoch": 0.5265594468660755, + "flos": 16620095836800.0, + "grad_norm": 1.6512594513108136, + "language_loss": 0.87681848, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.9036696, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 2.773366689682007 + }, + { + "auxiliary_loss_clip": 0.01438383, + "auxiliary_loss_mlp": 0.0124717, + "balance_loss_clip": 1.12721467, + "balance_loss_mlp": 1.03984153, + "epoch": 0.5266195701187434, + "flos": 24136432282080.0, + "grad_norm": 1.9883101740633673, + "language_loss": 0.75986081, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78671634, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 2.8199031352996826 + }, + { + "auxiliary_loss_clip": 0.01442245, + "auxiliary_loss_mlp": 0.01242892, + "balance_loss_clip": 1.1318512, + "balance_loss_mlp": 1.03708875, + "epoch": 0.5266796933714114, + "flos": 15845519302560.0, + "grad_norm": 2.20376456999844, + "language_loss": 0.71468186, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.74153316, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 2.7473278045654297 + }, + { + "auxiliary_loss_clip": 0.01441115, + "auxiliary_loss_mlp": 0.01237679, + "balance_loss_clip": 1.12958455, + "balance_loss_mlp": 1.02806163, + "epoch": 0.5267398166240793, + "flos": 20953090117440.0, + "grad_norm": 2.5268450916991156, + "language_loss": 0.75642985, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78321779, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.7877094745635986 + }, + { + "auxiliary_loss_clip": 0.01435012, + "auxiliary_loss_mlp": 0.01219809, + "balance_loss_clip": 1.1245203, + "balance_loss_mlp": 1.0147686, + "epoch": 0.5267999398767473, + "flos": 20998528417440.0, + "grad_norm": 1.948886738969746, + "language_loss": 0.70770013, + "learning_rate": 1.923489453654373e-06, + "loss": 0.73424828, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.8207337856292725 + }, + { + "auxiliary_loss_clip": 0.01486885, + "auxiliary_loss_mlp": 0.012071, + "balance_loss_clip": 1.19280386, + "balance_loss_mlp": 1.0094986, + "epoch": 0.5268600631294152, + "flos": 66855610641600.0, + "grad_norm": 0.9179881647197058, + "language_loss": 0.65339124, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.68033111, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 3.2500293254852295 + }, + { + "auxiliary_loss_clip": 0.01439122, + "auxiliary_loss_mlp": 0.01239672, + "balance_loss_clip": 1.12820971, + "balance_loss_mlp": 1.03234363, + "epoch": 0.5269201863820833, + "flos": 17167480506720.0, + "grad_norm": 2.107222940101451, + "language_loss": 0.70943493, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73622286, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 2.8334925174713135 + }, + { + "auxiliary_loss_clip": 0.01440118, + "auxiliary_loss_mlp": 0.01234811, + "balance_loss_clip": 1.12915134, + "balance_loss_mlp": 1.02881742, + "epoch": 0.5269803096347513, + "flos": 20524725106560.0, + "grad_norm": 1.7603833342714779, + "language_loss": 0.73937619, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76612544, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.726039409637451 + }, + { + "auxiliary_loss_clip": 0.01439685, + "auxiliary_loss_mlp": 0.01234471, + "balance_loss_clip": 1.12758017, + "balance_loss_mlp": 1.02809572, + "epoch": 0.5270404328874192, + "flos": 27233221656960.0, + "grad_norm": 1.5145272984864295, + "language_loss": 0.85725868, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.88400024, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 2.9075047969818115 + }, + { + "auxiliary_loss_clip": 0.01442066, + "auxiliary_loss_mlp": 0.01240347, + "balance_loss_clip": 1.13111067, + "balance_loss_mlp": 1.0335902, + "epoch": 0.5271005561400872, + "flos": 23112627258240.0, + "grad_norm": 1.8486627928869606, + "language_loss": 0.7941761, + "learning_rate": 1.921543607252017e-06, + "loss": 0.82100022, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 2.7782881259918213 + }, + { + "auxiliary_loss_clip": 0.01445607, + "auxiliary_loss_mlp": 0.01239046, + "balance_loss_clip": 1.13384247, + "balance_loss_mlp": 1.02828383, + "epoch": 0.5271606793927551, + "flos": 22566987283680.0, + "grad_norm": 2.4908317514705725, + "language_loss": 0.7391175, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.76596403, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.813340425491333 + }, + { + "auxiliary_loss_clip": 0.01436921, + "auxiliary_loss_mlp": 0.01225499, + "balance_loss_clip": 1.12509727, + "balance_loss_mlp": 1.01816976, + "epoch": 0.5272208026454231, + "flos": 18765902983680.0, + "grad_norm": 2.195282887711225, + "language_loss": 0.73670822, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76333249, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 2.7889742851257324 + }, + { + "auxiliary_loss_clip": 0.01435876, + "auxiliary_loss_mlp": 0.01227696, + "balance_loss_clip": 1.12533367, + "balance_loss_mlp": 1.01941371, + "epoch": 0.5272809258980911, + "flos": 20414239283520.0, + "grad_norm": 3.420060196345227, + "language_loss": 0.73866773, + "learning_rate": 1.920376134993436e-06, + "loss": 0.76530343, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.7654306888580322 + }, + { + "auxiliary_loss_clip": 0.01440786, + "auxiliary_loss_mlp": 0.01235155, + "balance_loss_clip": 1.1295346, + "balance_loss_mlp": 1.02572775, + "epoch": 0.5273410491507591, + "flos": 28259264442240.0, + "grad_norm": 2.0697399172388966, + "language_loss": 0.68124187, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70800126, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.895827293395996 + }, + { + "auxiliary_loss_clip": 0.01435394, + "auxiliary_loss_mlp": 0.01232646, + "balance_loss_clip": 1.12334764, + "balance_loss_mlp": 1.02627063, + "epoch": 0.527401172403427, + "flos": 22457335880160.0, + "grad_norm": 2.4312830925990343, + "language_loss": 0.76412165, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.79080206, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 2.897143840789795 + }, + { + "auxiliary_loss_clip": 0.01442108, + "auxiliary_loss_mlp": 0.01245195, + "balance_loss_clip": 1.13074529, + "balance_loss_mlp": 1.03786635, + "epoch": 0.527461295656095, + "flos": 21033119263680.0, + "grad_norm": 2.8980691460071157, + "language_loss": 0.6571123, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68398535, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 2.8298087120056152 + }, + { + "auxiliary_loss_clip": 0.01437078, + "auxiliary_loss_mlp": 0.01240336, + "balance_loss_clip": 1.12597609, + "balance_loss_mlp": 1.03491437, + "epoch": 0.5275214189087629, + "flos": 26324377979040.0, + "grad_norm": 2.503716065736702, + "language_loss": 0.86392176, + "learning_rate": 1.91881954765502e-06, + "loss": 0.89069593, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 2.8460097312927246 + }, + { + "auxiliary_loss_clip": 0.01439275, + "auxiliary_loss_mlp": 0.01235423, + "balance_loss_clip": 1.12850952, + "balance_loss_mlp": 1.03057361, + "epoch": 0.5275815421614309, + "flos": 20049063315840.0, + "grad_norm": 1.837469896836543, + "language_loss": 0.80062556, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82737249, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 4.236938714981079 + }, + { + "auxiliary_loss_clip": 0.01440353, + "auxiliary_loss_mlp": 0.01229092, + "balance_loss_clip": 1.12905908, + "balance_loss_mlp": 1.02405167, + "epoch": 0.5276416654140988, + "flos": 21434251491360.0, + "grad_norm": 1.6723302193233422, + "language_loss": 0.83738303, + "learning_rate": 1.918041272397012e-06, + "loss": 0.86407745, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 2.8322741985321045 + }, + { + "auxiliary_loss_clip": 0.01439173, + "auxiliary_loss_mlp": 0.01236053, + "balance_loss_clip": 1.12855792, + "balance_loss_mlp": 1.02910602, + "epoch": 0.5277017886667669, + "flos": 17166987440640.0, + "grad_norm": 1.9742199556168707, + "language_loss": 0.67642528, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70317757, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.7974250316619873 + }, + { + "auxiliary_loss_clip": 0.01443829, + "auxiliary_loss_mlp": 0.01238824, + "balance_loss_clip": 1.13369238, + "balance_loss_mlp": 1.03035057, + "epoch": 0.5277619119194349, + "flos": 20450195543520.0, + "grad_norm": 1.5152256714496004, + "language_loss": 0.82228053, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84910709, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.7808053493499756 + }, + { + "auxiliary_loss_clip": 0.01439661, + "auxiliary_loss_mlp": 0.01239106, + "balance_loss_clip": 1.12864804, + "balance_loss_mlp": 1.02967954, + "epoch": 0.5278220351721028, + "flos": 24063381917280.0, + "grad_norm": 2.8114781380123235, + "language_loss": 0.79396319, + "learning_rate": 1.916873882856013e-06, + "loss": 0.82075083, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.7860209941864014 + }, + { + "auxiliary_loss_clip": 0.01435052, + "auxiliary_loss_mlp": 0.01236423, + "balance_loss_clip": 1.12323427, + "balance_loss_mlp": 1.03252733, + "epoch": 0.5278821584247708, + "flos": 24645129864480.0, + "grad_norm": 3.050911319271432, + "language_loss": 0.77364212, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.80035686, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 2.8044981956481934 + }, + { + "auxiliary_loss_clip": 0.01440914, + "auxiliary_loss_mlp": 0.01243589, + "balance_loss_clip": 1.12875021, + "balance_loss_mlp": 1.03759491, + "epoch": 0.5279422816774387, + "flos": 35411183483040.0, + "grad_norm": 1.5539228218273191, + "language_loss": 0.69281256, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71965754, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.841609239578247 + }, + { + "auxiliary_loss_clip": 0.01431814, + "auxiliary_loss_mlp": 0.01227295, + "balance_loss_clip": 1.1208632, + "balance_loss_mlp": 1.02416193, + "epoch": 0.5280024049301068, + "flos": 22969295284320.0, + "grad_norm": 1.7947449323010016, + "language_loss": 0.72056466, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74715579, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 4.218434810638428 + }, + { + "auxiliary_loss_clip": 0.01438966, + "auxiliary_loss_mlp": 0.0123831, + "balance_loss_clip": 1.12708831, + "balance_loss_mlp": 1.03174448, + "epoch": 0.5280625281827747, + "flos": 21509805114720.0, + "grad_norm": 1.6073696886723658, + "language_loss": 0.6827938, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70956659, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.8129618167877197 + }, + { + "auxiliary_loss_clip": 0.01441091, + "auxiliary_loss_mlp": 0.01244827, + "balance_loss_clip": 1.12923288, + "balance_loss_mlp": 1.0365448, + "epoch": 0.5281226514354427, + "flos": 31210635794400.0, + "grad_norm": 2.076678036958877, + "language_loss": 0.69600224, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.72286141, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.8517792224884033 + }, + { + "auxiliary_loss_clip": 0.01430531, + "auxiliary_loss_mlp": 0.01240068, + "balance_loss_clip": 1.11733782, + "balance_loss_mlp": 1.03426528, + "epoch": 0.5281827746881106, + "flos": 25079904734400.0, + "grad_norm": 4.5037549813819355, + "language_loss": 0.74739182, + "learning_rate": 1.91453918928048e-06, + "loss": 0.7740978, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 2.808061122894287 + }, + { + "auxiliary_loss_clip": 0.0143675, + "auxiliary_loss_mlp": 0.01250118, + "balance_loss_clip": 1.12332106, + "balance_loss_mlp": 1.0441246, + "epoch": 0.5282428979407786, + "flos": 20633352449760.0, + "grad_norm": 2.0397843494069234, + "language_loss": 0.83354813, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.86041683, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.71677303314209 + }, + { + "auxiliary_loss_clip": 0.01438764, + "auxiliary_loss_mlp": 0.01229344, + "balance_loss_clip": 1.12615383, + "balance_loss_mlp": 1.02468574, + "epoch": 0.5283030211934465, + "flos": 22421607189120.0, + "grad_norm": 2.5012149481357038, + "language_loss": 0.82545614, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85213727, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.8070294857025146 + }, + { + "auxiliary_loss_clip": 0.01436545, + "auxiliary_loss_mlp": 0.01234855, + "balance_loss_clip": 1.12420034, + "balance_loss_mlp": 1.03210378, + "epoch": 0.5283631444461145, + "flos": 23617304455680.0, + "grad_norm": 2.1213758339082704, + "language_loss": 0.83259386, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85930789, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 4.269347190856934 + }, + { + "auxiliary_loss_clip": 0.01445548, + "auxiliary_loss_mlp": 0.01236608, + "balance_loss_clip": 1.13253069, + "balance_loss_mlp": 1.02927911, + "epoch": 0.5284232676987825, + "flos": 32674791127680.0, + "grad_norm": 1.867667591138073, + "language_loss": 0.74759042, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77441198, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.8213210105895996 + }, + { + "auxiliary_loss_clip": 0.01439299, + "auxiliary_loss_mlp": 0.01234899, + "balance_loss_clip": 1.12640309, + "balance_loss_mlp": 1.03062248, + "epoch": 0.5284833909514505, + "flos": 26763362874720.0, + "grad_norm": 1.6532772420812827, + "language_loss": 0.6990971, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.72583914, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.8541595935821533 + }, + { + "auxiliary_loss_clip": 0.01437135, + "auxiliary_loss_mlp": 0.01229024, + "balance_loss_clip": 1.12450778, + "balance_loss_mlp": 1.02360272, + "epoch": 0.5285435142041185, + "flos": 22092880547520.0, + "grad_norm": 1.6113513309900385, + "language_loss": 0.7916863, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81834793, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.8863000869750977 + }, + { + "auxiliary_loss_clip": 0.01445715, + "auxiliary_loss_mlp": 0.0123726, + "balance_loss_clip": 1.13158917, + "balance_loss_mlp": 1.03183866, + "epoch": 0.5286036374567864, + "flos": 20377524460320.0, + "grad_norm": 3.3642967761950726, + "language_loss": 0.66169417, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68852401, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.825578451156616 + }, + { + "auxiliary_loss_clip": 0.01435738, + "auxiliary_loss_mlp": 0.01235545, + "balance_loss_clip": 1.12220192, + "balance_loss_mlp": 1.03222156, + "epoch": 0.5286637607094544, + "flos": 24354445531680.0, + "grad_norm": 2.116823625996502, + "language_loss": 0.79920077, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82591349, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 2.8206400871276855 + }, + { + "auxiliary_loss_clip": 0.01439569, + "auxiliary_loss_mlp": 0.01242991, + "balance_loss_clip": 1.12646008, + "balance_loss_mlp": 1.0369978, + "epoch": 0.5287238839621223, + "flos": 17272883956320.0, + "grad_norm": 1.8453130250377259, + "language_loss": 0.84954011, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.87636566, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.7889304161071777 + }, + { + "auxiliary_loss_clip": 0.01437016, + "auxiliary_loss_mlp": 0.01239933, + "balance_loss_clip": 1.12403131, + "balance_loss_mlp": 1.03374803, + "epoch": 0.5287840072147904, + "flos": 17568840303360.0, + "grad_norm": 2.78977603176401, + "language_loss": 0.67845029, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.70521975, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.7651848793029785 + }, + { + "auxiliary_loss_clip": 0.01440438, + "auxiliary_loss_mlp": 0.01236083, + "balance_loss_clip": 1.1270709, + "balance_loss_mlp": 1.02818191, + "epoch": 0.5288441304674583, + "flos": 18554109952320.0, + "grad_norm": 1.7506717134693166, + "language_loss": 0.80150473, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82826996, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.741537570953369 + }, + { + "auxiliary_loss_clip": 0.01446876, + "auxiliary_loss_mlp": 0.01243302, + "balance_loss_clip": 1.1349684, + "balance_loss_mlp": 1.03616357, + "epoch": 0.5289042537201263, + "flos": 20816661068640.0, + "grad_norm": 3.140255689815449, + "language_loss": 0.69479513, + "learning_rate": 1.909870155310071e-06, + "loss": 0.72169685, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 2.807684898376465 + }, + { + "auxiliary_loss_clip": 0.01438268, + "auxiliary_loss_mlp": 0.01223734, + "balance_loss_clip": 1.12676752, + "balance_loss_mlp": 1.01888466, + "epoch": 0.5289643769727942, + "flos": 15736702318560.0, + "grad_norm": 1.6232661580224488, + "language_loss": 0.82379144, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.85041147, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 2.77984619140625 + }, + { + "auxiliary_loss_clip": 0.01438759, + "auxiliary_loss_mlp": 0.01230189, + "balance_loss_clip": 1.12636709, + "balance_loss_mlp": 1.02343261, + "epoch": 0.5290245002254622, + "flos": 19539379601280.0, + "grad_norm": 2.8820512652452024, + "language_loss": 0.71038288, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.73707235, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 2.735222339630127 + }, + { + "auxiliary_loss_clip": 0.01441751, + "auxiliary_loss_mlp": 0.01232438, + "balance_loss_clip": 1.12998343, + "balance_loss_mlp": 1.02758908, + "epoch": 0.5290846234781301, + "flos": 15816959033760.0, + "grad_norm": 4.209325463482923, + "language_loss": 0.69197237, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71871424, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 2.8168461322784424 + }, + { + "auxiliary_loss_clip": 0.01515018, + "auxiliary_loss_mlp": 0.01209747, + "balance_loss_clip": 1.21356177, + "balance_loss_mlp": 1.01138306, + "epoch": 0.5291447467307981, + "flos": 70063871971680.0, + "grad_norm": 0.9478718333960942, + "language_loss": 0.5682615, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59550917, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 3.2285923957824707 + }, + { + "auxiliary_loss_clip": 0.01441303, + "auxiliary_loss_mlp": 0.01234513, + "balance_loss_clip": 1.12917435, + "balance_loss_mlp": 1.0296638, + "epoch": 0.529204869983466, + "flos": 28366602228000.0, + "grad_norm": 1.7546848909814854, + "language_loss": 0.63960838, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66636658, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.848508596420288 + }, + { + "auxiliary_loss_clip": 0.01436417, + "auxiliary_loss_mlp": 0.01234712, + "balance_loss_clip": 1.12429595, + "balance_loss_mlp": 1.03100705, + "epoch": 0.5292649932361341, + "flos": 33761254200480.0, + "grad_norm": 2.067666259543329, + "language_loss": 0.69099581, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71770716, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 2.8472821712493896 + }, + { + "auxiliary_loss_clip": 0.01435062, + "auxiliary_loss_mlp": 0.01233417, + "balance_loss_clip": 1.12324119, + "balance_loss_mlp": 1.02685094, + "epoch": 0.5293251164888021, + "flos": 20449702477440.0, + "grad_norm": 1.6868336342362384, + "language_loss": 0.76277614, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.7894609, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.7946035861968994 + }, + { + "auxiliary_loss_clip": 0.01482646, + "auxiliary_loss_mlp": 0.01202728, + "balance_loss_clip": 1.18487322, + "balance_loss_mlp": 1.00436401, + "epoch": 0.52938523974147, + "flos": 66552864789600.0, + "grad_norm": 0.7560480289576073, + "language_loss": 0.52895403, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55580771, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 3.2911295890808105 + }, + { + "auxiliary_loss_clip": 0.01502311, + "auxiliary_loss_mlp": 0.01202858, + "balance_loss_clip": 1.20716333, + "balance_loss_mlp": 1.00373077, + "epoch": 0.529445362994138, + "flos": 67158545770080.0, + "grad_norm": 0.7505143626446515, + "language_loss": 0.63753861, + "learning_rate": 1.906368701413693e-06, + "loss": 0.6645903, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.1499717235565186 + }, + { + "auxiliary_loss_clip": 0.01437241, + "auxiliary_loss_mlp": 0.01235009, + "balance_loss_clip": 1.12733495, + "balance_loss_mlp": 1.0276798, + "epoch": 0.5295054862468059, + "flos": 17751276574560.0, + "grad_norm": 2.0604891281240865, + "language_loss": 0.72365177, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.75037432, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 2.722045421600342 + }, + { + "auxiliary_loss_clip": 0.0144236, + "auxiliary_loss_mlp": 0.01247249, + "balance_loss_clip": 1.13348055, + "balance_loss_mlp": 1.0424, + "epoch": 0.529565609499474, + "flos": 11398815305280.0, + "grad_norm": 2.1897997610304962, + "language_loss": 0.69363105, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.72052717, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 2.751094102859497 + }, + { + "auxiliary_loss_clip": 0.01439085, + "auxiliary_loss_mlp": 0.01234389, + "balance_loss_clip": 1.12987757, + "balance_loss_mlp": 1.03087449, + "epoch": 0.5296257327521419, + "flos": 17197975111680.0, + "grad_norm": 1.9663483167798952, + "language_loss": 0.86450148, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.89123625, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.719658613204956 + }, + { + "auxiliary_loss_clip": 0.01446142, + "auxiliary_loss_mlp": 0.01241273, + "balance_loss_clip": 1.13665342, + "balance_loss_mlp": 1.0337534, + "epoch": 0.5296858560048099, + "flos": 39967007889600.0, + "grad_norm": 1.7257559210090225, + "language_loss": 0.63687736, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66375148, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 3.074526786804199 + }, + { + "auxiliary_loss_clip": 0.0144757, + "auxiliary_loss_mlp": 0.01232018, + "balance_loss_clip": 1.13822329, + "balance_loss_mlp": 1.02564323, + "epoch": 0.5297459792574778, + "flos": 20963634145920.0, + "grad_norm": 1.5318861592194255, + "language_loss": 0.68005913, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.706855, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 2.795076608657837 + }, + { + "auxiliary_loss_clip": 0.01506506, + "auxiliary_loss_mlp": 0.01195122, + "balance_loss_clip": 1.21102917, + "balance_loss_mlp": 0.99752045, + "epoch": 0.5298061025101458, + "flos": 66529956180960.0, + "grad_norm": 0.6665416843973563, + "language_loss": 0.53322208, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.5602383, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 3.4350619316101074 + }, + { + "auxiliary_loss_clip": 0.01505599, + "auxiliary_loss_mlp": 0.01200409, + "balance_loss_clip": 1.20982969, + "balance_loss_mlp": 1.0043335, + "epoch": 0.5298662257628137, + "flos": 67669860395520.0, + "grad_norm": 0.7302563325026307, + "language_loss": 0.56369323, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.59075332, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 3.33304762840271 + }, + { + "auxiliary_loss_clip": 0.01444076, + "auxiliary_loss_mlp": 0.01241484, + "balance_loss_clip": 1.13550234, + "balance_loss_mlp": 1.03987694, + "epoch": 0.5299263490154817, + "flos": 19648462082400.0, + "grad_norm": 1.6905765524362313, + "language_loss": 0.81803238, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.84488797, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 4.0729522705078125 + }, + { + "auxiliary_loss_clip": 0.01443209, + "auxiliary_loss_mlp": 0.01237026, + "balance_loss_clip": 1.13278222, + "balance_loss_mlp": 1.03255844, + "epoch": 0.5299864722681497, + "flos": 22057227712800.0, + "grad_norm": 1.5917479410752613, + "language_loss": 0.85100234, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87780476, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.8819267749786377 + }, + { + "auxiliary_loss_clip": 0.01444111, + "auxiliary_loss_mlp": 0.01243408, + "balance_loss_clip": 1.13477182, + "balance_loss_mlp": 1.03989387, + "epoch": 0.5300465955208177, + "flos": 21765936529440.0, + "grad_norm": 2.666580064272006, + "language_loss": 0.66668832, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.69356346, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.871396780014038 + }, + { + "auxiliary_loss_clip": 0.01446637, + "auxiliary_loss_mlp": 0.01233844, + "balance_loss_clip": 1.13740182, + "balance_loss_mlp": 1.0297581, + "epoch": 0.5301067187734857, + "flos": 42999811729920.0, + "grad_norm": 1.6519067550174502, + "language_loss": 0.72488618, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.75169098, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 2.9777486324310303 + }, + { + "auxiliary_loss_clip": 0.01439636, + "auxiliary_loss_mlp": 0.01236939, + "balance_loss_clip": 1.13013899, + "balance_loss_mlp": 1.03285241, + "epoch": 0.5301668420261536, + "flos": 20556167915520.0, + "grad_norm": 1.8245207258742802, + "language_loss": 0.6494782, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.6762439, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.823739528656006 + }, + { + "auxiliary_loss_clip": 0.01447139, + "auxiliary_loss_mlp": 0.01235234, + "balance_loss_clip": 1.13798118, + "balance_loss_mlp": 1.02943122, + "epoch": 0.5302269652788216, + "flos": 17488204306560.0, + "grad_norm": 3.035648382345808, + "language_loss": 0.74756479, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77438855, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 2.7895636558532715 + }, + { + "auxiliary_loss_clip": 0.01443597, + "auxiliary_loss_mlp": 0.01237483, + "balance_loss_clip": 1.13286996, + "balance_loss_mlp": 1.03148961, + "epoch": 0.5302870885314895, + "flos": 14575937251680.0, + "grad_norm": 2.046414703388482, + "language_loss": 0.81842732, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84523809, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 4.177633047103882 + }, + { + "auxiliary_loss_clip": 0.01444511, + "auxiliary_loss_mlp": 0.0124154, + "balance_loss_clip": 1.13587999, + "balance_loss_mlp": 1.03802633, + "epoch": 0.5303472117841576, + "flos": 23440140198720.0, + "grad_norm": 2.060634762940414, + "language_loss": 0.72569907, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.75255954, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 4.30115818977356 + }, + { + "auxiliary_loss_clip": 0.01442936, + "auxiliary_loss_mlp": 0.01238146, + "balance_loss_clip": 1.13371015, + "balance_loss_mlp": 1.03672981, + "epoch": 0.5304073350368255, + "flos": 22711115748960.0, + "grad_norm": 1.6022867035444546, + "language_loss": 0.74067235, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76748317, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 2.7799367904663086 + }, + { + "auxiliary_loss_clip": 0.0144542, + "auxiliary_loss_mlp": 0.01235957, + "balance_loss_clip": 1.13448262, + "balance_loss_mlp": 1.03091741, + "epoch": 0.5304674582894935, + "flos": 27931068794880.0, + "grad_norm": 1.8198234693896116, + "language_loss": 0.67419112, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.70100486, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.830085039138794 + }, + { + "auxiliary_loss_clip": 0.01439758, + "auxiliary_loss_mlp": 0.01242688, + "balance_loss_clip": 1.1299088, + "balance_loss_mlp": 1.03802919, + "epoch": 0.5305275815421614, + "flos": 21252725496000.0, + "grad_norm": 2.280101991943241, + "language_loss": 0.6968987, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.72372317, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.7322957515716553 + }, + { + "auxiliary_loss_clip": 0.01445779, + "auxiliary_loss_mlp": 0.01241711, + "balance_loss_clip": 1.13524377, + "balance_loss_mlp": 1.03781581, + "epoch": 0.5305877047948294, + "flos": 17605024132320.0, + "grad_norm": 2.898505180929255, + "language_loss": 0.76162845, + "learning_rate": 1.898977700702689e-06, + "loss": 0.78850335, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.8371615409851074 + }, + { + "auxiliary_loss_clip": 0.0144492, + "auxiliary_loss_mlp": 0.01231072, + "balance_loss_clip": 1.13529801, + "balance_loss_mlp": 1.02679491, + "epoch": 0.5306478280474973, + "flos": 15197320490400.0, + "grad_norm": 1.8379428718338384, + "language_loss": 0.85300183, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87976182, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 4.338330507278442 + }, + { + "auxiliary_loss_clip": 0.01439273, + "auxiliary_loss_mlp": 0.01231722, + "balance_loss_clip": 1.12914872, + "balance_loss_mlp": 1.02878034, + "epoch": 0.5307079513001653, + "flos": 15343041938400.0, + "grad_norm": 1.5620376647570824, + "language_loss": 0.64714319, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.67385316, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 2.771465539932251 + }, + { + "auxiliary_loss_clip": 0.01446381, + "auxiliary_loss_mlp": 0.01247777, + "balance_loss_clip": 1.13574219, + "balance_loss_mlp": 1.04254603, + "epoch": 0.5307680745528333, + "flos": 43547841178560.0, + "grad_norm": 1.823391334338317, + "language_loss": 0.59965312, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62659472, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.974167585372925 + }, + { + "auxiliary_loss_clip": 0.01448724, + "auxiliary_loss_mlp": 0.01245847, + "balance_loss_clip": 1.13688803, + "balance_loss_mlp": 1.03890002, + "epoch": 0.5308281978055013, + "flos": 20051073508320.0, + "grad_norm": 1.6211560277335129, + "language_loss": 0.81567216, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.84261787, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.827518939971924 + }, + { + "auxiliary_loss_clip": 0.01444134, + "auxiliary_loss_mlp": 0.01237472, + "balance_loss_clip": 1.13263953, + "balance_loss_mlp": 1.03491175, + "epoch": 0.5308883210581693, + "flos": 20706023532960.0, + "grad_norm": 2.271328767367858, + "language_loss": 0.78383791, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.81065404, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 2.7702531814575195 + }, + { + "auxiliary_loss_clip": 0.01441462, + "auxiliary_loss_mlp": 0.01232497, + "balance_loss_clip": 1.12977803, + "balance_loss_mlp": 1.02802968, + "epoch": 0.5309484443108372, + "flos": 14357241295200.0, + "grad_norm": 2.0098428848706984, + "language_loss": 0.80934536, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83608496, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.7275779247283936 + }, + { + "auxiliary_loss_clip": 0.0144193, + "auxiliary_loss_mlp": 0.01243272, + "balance_loss_clip": 1.12989879, + "balance_loss_mlp": 1.03727877, + "epoch": 0.5310085675635052, + "flos": 20012538133440.0, + "grad_norm": 2.0249768250162172, + "language_loss": 0.73229301, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75914502, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 2.789970874786377 + }, + { + "auxiliary_loss_clip": 0.01447559, + "auxiliary_loss_mlp": 0.01244732, + "balance_loss_clip": 1.13600802, + "balance_loss_mlp": 1.03854728, + "epoch": 0.5310686908161731, + "flos": 22129481586240.0, + "grad_norm": 2.0821974084507686, + "language_loss": 0.75546223, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.78238517, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.7834365367889404 + }, + { + "auxiliary_loss_clip": 0.01440494, + "auxiliary_loss_mlp": 0.01241058, + "balance_loss_clip": 1.12803555, + "balance_loss_mlp": 1.03792572, + "epoch": 0.5311288140688412, + "flos": 24720228349920.0, + "grad_norm": 2.6500299276753507, + "language_loss": 0.7334938, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.76030922, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.8274168968200684 + }, + { + "auxiliary_loss_clip": 0.01435839, + "auxiliary_loss_mlp": 0.01244319, + "balance_loss_clip": 1.12405622, + "balance_loss_mlp": 1.03813457, + "epoch": 0.5311889373215091, + "flos": 24100020884160.0, + "grad_norm": 1.7195044554246384, + "language_loss": 0.78217065, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.80897224, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.833106279373169 + }, + { + "auxiliary_loss_clip": 0.01441446, + "auxiliary_loss_mlp": 0.01242055, + "balance_loss_clip": 1.1290884, + "balance_loss_mlp": 1.03510737, + "epoch": 0.5312490605741771, + "flos": 22019147475840.0, + "grad_norm": 2.3462490877964974, + "language_loss": 0.72363949, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.75047445, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.7695422172546387 + }, + { + "auxiliary_loss_clip": 0.0144037, + "auxiliary_loss_mlp": 0.01238092, + "balance_loss_clip": 1.1281817, + "balance_loss_mlp": 1.03286171, + "epoch": 0.531309183826845, + "flos": 19392558236640.0, + "grad_norm": 1.9948432110285965, + "language_loss": 0.80594492, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83272958, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 2.7016220092773438 + }, + { + "auxiliary_loss_clip": 0.01443593, + "auxiliary_loss_mlp": 0.01239967, + "balance_loss_clip": 1.13095784, + "balance_loss_mlp": 1.03073049, + "epoch": 0.531369307079513, + "flos": 20191940151840.0, + "grad_norm": 1.8036869401710651, + "language_loss": 0.85668719, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88352275, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 2.7890472412109375 + }, + { + "auxiliary_loss_clip": 0.01438664, + "auxiliary_loss_mlp": 0.01237437, + "balance_loss_clip": 1.1275295, + "balance_loss_mlp": 1.03277826, + "epoch": 0.5314294303321809, + "flos": 18882153887040.0, + "grad_norm": 2.385460280294817, + "language_loss": 0.72910428, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.75586534, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 2.7639997005462646 + }, + { + "auxiliary_loss_clip": 0.01444556, + "auxiliary_loss_mlp": 0.01233012, + "balance_loss_clip": 1.13113713, + "balance_loss_mlp": 1.02911687, + "epoch": 0.531489553584849, + "flos": 23042307720960.0, + "grad_norm": 1.5413413848263473, + "language_loss": 0.76555663, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79233229, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 2.7606287002563477 + }, + { + "auxiliary_loss_clip": 0.01443524, + "auxiliary_loss_mlp": 0.01236824, + "balance_loss_clip": 1.1310457, + "balance_loss_mlp": 1.03140259, + "epoch": 0.5315496768375169, + "flos": 19792438835040.0, + "grad_norm": 2.3845004571137594, + "language_loss": 0.77322704, + "learning_rate": 1.892754768590216e-06, + "loss": 0.80003059, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 2.75875186920166 + }, + { + "auxiliary_loss_clip": 0.01546325, + "auxiliary_loss_mlp": 0.01224518, + "balance_loss_clip": 1.24361157, + "balance_loss_mlp": 1.02844238, + "epoch": 0.5316098000901849, + "flos": 71030480601600.0, + "grad_norm": 0.6934570593586968, + "language_loss": 0.56723589, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.59494436, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 3.413762331008911 + }, + { + "auxiliary_loss_clip": 0.01445298, + "auxiliary_loss_mlp": 0.01248246, + "balance_loss_clip": 1.13181114, + "balance_loss_mlp": 1.04415965, + "epoch": 0.5316699233428529, + "flos": 16437431996640.0, + "grad_norm": 1.9071336893706914, + "language_loss": 0.73597181, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76290727, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.7946853637695312 + }, + { + "auxiliary_loss_clip": 0.01543841, + "auxiliary_loss_mlp": 0.0121666, + "balance_loss_clip": 1.2409538, + "balance_loss_mlp": 1.01982117, + "epoch": 0.5317300465955208, + "flos": 67428407178720.0, + "grad_norm": 0.8638797124685569, + "language_loss": 0.60914838, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63675344, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 3.357630729675293 + }, + { + "auxiliary_loss_clip": 0.0154335, + "auxiliary_loss_mlp": 0.01211227, + "balance_loss_clip": 1.24079514, + "balance_loss_mlp": 1.01438904, + "epoch": 0.5317901698481888, + "flos": 59513442128640.0, + "grad_norm": 0.832896475570646, + "language_loss": 0.6214416, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64898741, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.2626354694366455 + }, + { + "auxiliary_loss_clip": 0.01447355, + "auxiliary_loss_mlp": 0.0123698, + "balance_loss_clip": 1.13398671, + "balance_loss_mlp": 1.03251195, + "epoch": 0.5318502931008567, + "flos": 19130889310560.0, + "grad_norm": 2.8266669474387385, + "language_loss": 0.7542721, + "learning_rate": 1.890810312970474e-06, + "loss": 0.78111541, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 2.8298678398132324 + }, + { + "auxiliary_loss_clip": 0.01439383, + "auxiliary_loss_mlp": 0.01232406, + "balance_loss_clip": 1.12648213, + "balance_loss_mlp": 1.02889216, + "epoch": 0.5319104163535248, + "flos": 24683361814080.0, + "grad_norm": 3.82589964186694, + "language_loss": 0.75522029, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.7819382, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 2.8802154064178467 + }, + { + "auxiliary_loss_clip": 0.01440988, + "auxiliary_loss_mlp": 0.01228698, + "balance_loss_clip": 1.12797058, + "balance_loss_mlp": 1.02327693, + "epoch": 0.5319705396061927, + "flos": 19387248294240.0, + "grad_norm": 1.6270466743303467, + "language_loss": 0.87961781, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.90631467, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 2.804269313812256 + }, + { + "auxiliary_loss_clip": 0.01446271, + "auxiliary_loss_mlp": 0.01243487, + "balance_loss_clip": 1.13272703, + "balance_loss_mlp": 1.03577697, + "epoch": 0.5320306628588607, + "flos": 18261263714400.0, + "grad_norm": 2.056825523806919, + "language_loss": 0.74587965, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.77277732, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.806180953979492 + }, + { + "auxiliary_loss_clip": 0.01438677, + "auxiliary_loss_mlp": 0.01243458, + "balance_loss_clip": 1.12568426, + "balance_loss_mlp": 1.03937149, + "epoch": 0.5320907861115286, + "flos": 23734579419360.0, + "grad_norm": 2.486979510022027, + "language_loss": 0.79948652, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.82630789, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 2.8181426525115967 + }, + { + "auxiliary_loss_clip": 0.01437826, + "auxiliary_loss_mlp": 0.01238713, + "balance_loss_clip": 1.12510204, + "balance_loss_mlp": 1.03672493, + "epoch": 0.5321509093641966, + "flos": 34498091851200.0, + "grad_norm": 1.3957636580985286, + "language_loss": 0.55017728, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57694268, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 2.852883815765381 + }, + { + "auxiliary_loss_clip": 0.01441316, + "auxiliary_loss_mlp": 0.01240826, + "balance_loss_clip": 1.12804842, + "balance_loss_mlp": 1.03635788, + "epoch": 0.5322110326168645, + "flos": 20012993271360.0, + "grad_norm": 1.5817464135826012, + "language_loss": 0.68485284, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.71167427, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 4.185418605804443 + }, + { + "auxiliary_loss_clip": 0.01530996, + "auxiliary_loss_mlp": 0.01199661, + "balance_loss_clip": 1.22950113, + "balance_loss_mlp": 1.00053406, + "epoch": 0.5322711558695326, + "flos": 64637663405760.0, + "grad_norm": 0.7993085132088933, + "language_loss": 0.62659264, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.65389919, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 3.3021953105926514 + }, + { + "auxiliary_loss_clip": 0.01435176, + "auxiliary_loss_mlp": 0.01247464, + "balance_loss_clip": 1.12200427, + "balance_loss_mlp": 1.04242373, + "epoch": 0.5323312791222005, + "flos": 14941113219360.0, + "grad_norm": 2.175877474660549, + "language_loss": 0.79707134, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.82389772, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.741560220718384 + }, + { + "auxiliary_loss_clip": 0.01444623, + "auxiliary_loss_mlp": 0.0123138, + "balance_loss_clip": 1.13208389, + "balance_loss_mlp": 1.0265305, + "epoch": 0.5323914023748685, + "flos": 23443098595200.0, + "grad_norm": 1.766644112293599, + "language_loss": 0.73367071, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.76043081, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 2.862600326538086 + }, + { + "auxiliary_loss_clip": 0.01438274, + "auxiliary_loss_mlp": 0.01234777, + "balance_loss_clip": 1.12596714, + "balance_loss_mlp": 1.03088117, + "epoch": 0.5324515256275365, + "flos": 26288573431680.0, + "grad_norm": 2.050581884493881, + "language_loss": 0.65334594, + "learning_rate": 1.886921714110507e-06, + "loss": 0.68007642, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.865725517272949 + }, + { + "auxiliary_loss_clip": 0.01432096, + "auxiliary_loss_mlp": 0.01246194, + "balance_loss_clip": 1.11886668, + "balance_loss_mlp": 1.0405817, + "epoch": 0.5325116488802044, + "flos": 26873393559840.0, + "grad_norm": 1.7747269719274856, + "language_loss": 0.77905267, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.80583555, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.8257718086242676 + }, + { + "auxiliary_loss_clip": 0.01437539, + "auxiliary_loss_mlp": 0.01237432, + "balance_loss_clip": 1.12405539, + "balance_loss_mlp": 1.03563499, + "epoch": 0.5325717721328724, + "flos": 25887099850560.0, + "grad_norm": 1.8858531866791584, + "language_loss": 0.71097487, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73772454, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 4.3138628005981445 + }, + { + "auxiliary_loss_clip": 0.01450529, + "auxiliary_loss_mlp": 0.01233192, + "balance_loss_clip": 1.13544738, + "balance_loss_mlp": 1.02700734, + "epoch": 0.5326318953855403, + "flos": 21801513507840.0, + "grad_norm": 1.6667114525925462, + "language_loss": 0.69680625, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.72364342, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.8060925006866455 + }, + { + "auxiliary_loss_clip": 0.01449236, + "auxiliary_loss_mlp": 0.01230489, + "balance_loss_clip": 1.13547277, + "balance_loss_mlp": 1.02773738, + "epoch": 0.5326920186382084, + "flos": 20925022914720.0, + "grad_norm": 3.0107550812308768, + "language_loss": 0.69355589, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.72035313, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 4.32903790473938 + }, + { + "auxiliary_loss_clip": 0.01440626, + "auxiliary_loss_mlp": 0.01231267, + "balance_loss_clip": 1.12796509, + "balance_loss_mlp": 1.02813458, + "epoch": 0.5327521418908763, + "flos": 21435541048800.0, + "grad_norm": 2.270027980809485, + "language_loss": 0.77712905, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80384803, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.826754331588745 + }, + { + "auxiliary_loss_clip": 0.01448493, + "auxiliary_loss_mlp": 0.01239316, + "balance_loss_clip": 1.13572526, + "balance_loss_mlp": 1.03370345, + "epoch": 0.5328122651435443, + "flos": 21762295426080.0, + "grad_norm": 2.1790927764028654, + "language_loss": 0.85860556, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.88548362, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 2.855468511581421 + }, + { + "auxiliary_loss_clip": 0.01443473, + "auxiliary_loss_mlp": 0.01241277, + "balance_loss_clip": 1.13157427, + "balance_loss_mlp": 1.03337574, + "epoch": 0.5328723883962122, + "flos": 18298471603680.0, + "grad_norm": 1.91477890141367, + "language_loss": 0.61845756, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.64530504, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.771854877471924 + }, + { + "auxiliary_loss_clip": 0.01447549, + "auxiliary_loss_mlp": 0.01242938, + "balance_loss_clip": 1.13523555, + "balance_loss_mlp": 1.04018641, + "epoch": 0.5329325116488802, + "flos": 25377302351520.0, + "grad_norm": 4.326781076055172, + "language_loss": 0.73148382, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7583887, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 4.433160066604614 + }, + { + "auxiliary_loss_clip": 0.01443414, + "auxiliary_loss_mlp": 0.01236427, + "balance_loss_clip": 1.1312927, + "balance_loss_mlp": 1.03062415, + "epoch": 0.5329926349015481, + "flos": 25594405325280.0, + "grad_norm": 1.6870730329778814, + "language_loss": 0.64267749, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66947591, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.7700324058532715 + }, + { + "auxiliary_loss_clip": 0.01440775, + "auxiliary_loss_mlp": 0.01241964, + "balance_loss_clip": 1.12788725, + "balance_loss_mlp": 1.03997636, + "epoch": 0.5330527581542162, + "flos": 22891617684000.0, + "grad_norm": 2.6671305533660425, + "language_loss": 0.78737593, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.81420326, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 2.8327202796936035 + }, + { + "auxiliary_loss_clip": 0.01441654, + "auxiliary_loss_mlp": 0.01236921, + "balance_loss_clip": 1.12838686, + "balance_loss_mlp": 1.03245282, + "epoch": 0.5331128814068841, + "flos": 16028183142720.0, + "grad_norm": 2.0488283053144603, + "language_loss": 0.73704827, + "learning_rate": 1.882644751189108e-06, + "loss": 0.76383406, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.7771008014678955 + }, + { + "auxiliary_loss_clip": 0.01440485, + "auxiliary_loss_mlp": 0.0123448, + "balance_loss_clip": 1.12730885, + "balance_loss_mlp": 1.02943993, + "epoch": 0.5331730046595521, + "flos": 39347634843360.0, + "grad_norm": 12.357101152199853, + "language_loss": 0.71985543, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74660504, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 2.8918304443359375 + }, + { + "auxiliary_loss_clip": 0.01432977, + "auxiliary_loss_mlp": 0.01238212, + "balance_loss_clip": 1.11890471, + "balance_loss_mlp": 1.03546071, + "epoch": 0.5332331279122201, + "flos": 24026667094080.0, + "grad_norm": 1.8530468139084422, + "language_loss": 0.78389502, + "learning_rate": 1.881867178843637e-06, + "loss": 0.81060696, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.792041778564453 + }, + { + "auxiliary_loss_clip": 0.01437665, + "auxiliary_loss_mlp": 0.01242184, + "balance_loss_clip": 1.12149453, + "balance_loss_mlp": 1.03866982, + "epoch": 0.533293251164888, + "flos": 17131258749600.0, + "grad_norm": 1.8919343920250113, + "language_loss": 0.7611801, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.78797865, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.7327895164489746 + }, + { + "auxiliary_loss_clip": 0.01443336, + "auxiliary_loss_mlp": 0.01240413, + "balance_loss_clip": 1.12689018, + "balance_loss_mlp": 1.03499186, + "epoch": 0.533353374417556, + "flos": 22128495454080.0, + "grad_norm": 1.6824142142570884, + "language_loss": 0.75173932, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77857685, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.854780673980713 + }, + { + "auxiliary_loss_clip": 0.01441183, + "auxiliary_loss_mlp": 0.01245131, + "balance_loss_clip": 1.1249938, + "balance_loss_mlp": 1.04066277, + "epoch": 0.533413497670224, + "flos": 15012798170400.0, + "grad_norm": 2.0446831790039623, + "language_loss": 0.72448099, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.75134408, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.7280144691467285 + }, + { + "auxiliary_loss_clip": 0.01445959, + "auxiliary_loss_mlp": 0.01239476, + "balance_loss_clip": 1.12988615, + "balance_loss_mlp": 1.03081203, + "epoch": 0.533473620922892, + "flos": 19612012756320.0, + "grad_norm": 2.4013177938650805, + "language_loss": 0.64996248, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67681682, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.7503864765167236 + }, + { + "auxiliary_loss_clip": 0.01445836, + "auxiliary_loss_mlp": 0.01240438, + "balance_loss_clip": 1.13050663, + "balance_loss_mlp": 1.03549361, + "epoch": 0.5335337441755599, + "flos": 14284380571200.0, + "grad_norm": 2.1919704299833414, + "language_loss": 0.79444993, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82131267, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.7746522426605225 + }, + { + "auxiliary_loss_clip": 0.01445176, + "auxiliary_loss_mlp": 0.01232102, + "balance_loss_clip": 1.12732768, + "balance_loss_mlp": 1.02877879, + "epoch": 0.5335938674282279, + "flos": 20817002422080.0, + "grad_norm": 1.7259089990548049, + "language_loss": 0.69538355, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72215635, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.823375701904297 + }, + { + "auxiliary_loss_clip": 0.01552816, + "auxiliary_loss_mlp": 0.0120372, + "balance_loss_clip": 1.24365246, + "balance_loss_mlp": 1.00764465, + "epoch": 0.5336539906808958, + "flos": 71404190405280.0, + "grad_norm": 0.7225674783959055, + "language_loss": 0.59584218, + "learning_rate": 1.879145817516126e-06, + "loss": 0.6234076, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 3.4326467514038086 + }, + { + "auxiliary_loss_clip": 0.01443893, + "auxiliary_loss_mlp": 0.01236007, + "balance_loss_clip": 1.12657344, + "balance_loss_mlp": 1.03096724, + "epoch": 0.5337141139335638, + "flos": 20154846047040.0, + "grad_norm": 1.9649360209984368, + "language_loss": 0.7468937, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.77369273, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 2.806150436401367 + }, + { + "auxiliary_loss_clip": 0.01550131, + "auxiliary_loss_mlp": 0.01209686, + "balance_loss_clip": 1.24140143, + "balance_loss_mlp": 1.0128479, + "epoch": 0.5337742371862317, + "flos": 67734832062240.0, + "grad_norm": 0.7543190395924575, + "language_loss": 0.57118303, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59878117, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 3.20931077003479 + }, + { + "auxiliary_loss_clip": 0.0144466, + "auxiliary_loss_mlp": 0.01241606, + "balance_loss_clip": 1.12643278, + "balance_loss_mlp": 1.03828287, + "epoch": 0.5338343604388998, + "flos": 25011216108000.0, + "grad_norm": 1.5778128308184538, + "language_loss": 0.72494566, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.75180829, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.833585500717163 + }, + { + "auxiliary_loss_clip": 0.01437388, + "auxiliary_loss_mlp": 0.01235914, + "balance_loss_clip": 1.12176967, + "balance_loss_mlp": 1.03049278, + "epoch": 0.5338944836915677, + "flos": 17603089796160.0, + "grad_norm": 2.3283471399021898, + "language_loss": 0.83236325, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85909623, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 2.772981882095337 + }, + { + "auxiliary_loss_clip": 0.01434559, + "auxiliary_loss_mlp": 0.01230282, + "balance_loss_clip": 1.11944151, + "balance_loss_mlp": 1.02905691, + "epoch": 0.5339546069442357, + "flos": 21726187453440.0, + "grad_norm": 1.4340900379932242, + "language_loss": 0.79150975, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81815815, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 2.804065227508545 + }, + { + "auxiliary_loss_clip": 0.01498207, + "auxiliary_loss_mlp": 0.01213402, + "balance_loss_clip": 1.19616091, + "balance_loss_mlp": 1.01580048, + "epoch": 0.5340147301969036, + "flos": 69729494034240.0, + "grad_norm": 0.790379841724632, + "language_loss": 0.59207267, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61918879, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 3.2148749828338623 + }, + { + "auxiliary_loss_clip": 0.01497998, + "auxiliary_loss_mlp": 0.01213097, + "balance_loss_clip": 1.1959269, + "balance_loss_mlp": 1.01473236, + "epoch": 0.5340748534495716, + "flos": 63885995844480.0, + "grad_norm": 0.859326048497624, + "language_loss": 0.63619328, + "learning_rate": 1.876424680745913e-06, + "loss": 0.66330421, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 3.0314083099365234 + }, + { + "auxiliary_loss_clip": 0.01440879, + "auxiliary_loss_mlp": 0.0124211, + "balance_loss_clip": 1.12616086, + "balance_loss_mlp": 1.03592551, + "epoch": 0.5341349767022396, + "flos": 28696845996000.0, + "grad_norm": 3.94331789303401, + "language_loss": 0.82813859, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.85496843, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 2.837414264678955 + }, + { + "auxiliary_loss_clip": 0.01446297, + "auxiliary_loss_mlp": 0.01242491, + "balance_loss_clip": 1.13033891, + "balance_loss_mlp": 1.03878641, + "epoch": 0.5341950999549075, + "flos": 16291558836000.0, + "grad_norm": 3.6562957348265104, + "language_loss": 0.72307706, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74996495, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 2.780038595199585 + }, + { + "auxiliary_loss_clip": 0.01436534, + "auxiliary_loss_mlp": 0.01242844, + "balance_loss_clip": 1.12107468, + "balance_loss_mlp": 1.03913879, + "epoch": 0.5342552232075756, + "flos": 14357393007840.0, + "grad_norm": 1.8177068898921942, + "language_loss": 0.78246737, + "learning_rate": 1.87525854926798e-06, + "loss": 0.8092612, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 2.7568440437316895 + }, + { + "auxiliary_loss_clip": 0.01436915, + "auxiliary_loss_mlp": 0.01243207, + "balance_loss_clip": 1.12186551, + "balance_loss_mlp": 1.03873885, + "epoch": 0.5343153464602435, + "flos": 30300502559040.0, + "grad_norm": 1.5466224725083166, + "language_loss": 0.7497257, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.77652687, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 2.922145366668701 + }, + { + "auxiliary_loss_clip": 0.01438093, + "auxiliary_loss_mlp": 0.01238817, + "balance_loss_clip": 1.12247252, + "balance_loss_mlp": 1.03759193, + "epoch": 0.5343754697129115, + "flos": 15598111364640.0, + "grad_norm": 2.8009810325984152, + "language_loss": 0.69207805, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.7188471, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.7880711555480957 + }, + { + "auxiliary_loss_clip": 0.01437609, + "auxiliary_loss_mlp": 0.0124441, + "balance_loss_clip": 1.12183809, + "balance_loss_mlp": 1.03917885, + "epoch": 0.5344355929655794, + "flos": 16911500804640.0, + "grad_norm": 2.5663812321176818, + "language_loss": 0.77558839, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.80240858, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 2.8474225997924805 + }, + { + "auxiliary_loss_clip": 0.01439263, + "auxiliary_loss_mlp": 0.01249784, + "balance_loss_clip": 1.12386847, + "balance_loss_mlp": 1.04760504, + "epoch": 0.5344957162182474, + "flos": 16799952993120.0, + "grad_norm": 12.222983894319565, + "language_loss": 0.69395512, + "learning_rate": 1.873703773589102e-06, + "loss": 0.72084558, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 4.298233985900879 + }, + { + "auxiliary_loss_clip": 0.01435346, + "auxiliary_loss_mlp": 0.0125484, + "balance_loss_clip": 1.1189568, + "balance_loss_mlp": 1.051898, + "epoch": 0.5345558394709153, + "flos": 12706781018400.0, + "grad_norm": 2.4743164715472385, + "language_loss": 0.76722729, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79412913, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.7701382637023926 + }, + { + "auxiliary_loss_clip": 0.01434694, + "auxiliary_loss_mlp": 0.01238352, + "balance_loss_clip": 1.12024689, + "balance_loss_mlp": 1.03808069, + "epoch": 0.5346159627235834, + "flos": 22457222095680.0, + "grad_norm": 2.3636563806080173, + "language_loss": 0.73991388, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76664436, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 2.7751095294952393 + }, + { + "auxiliary_loss_clip": 0.0143216, + "auxiliary_loss_mlp": 0.012403, + "balance_loss_clip": 1.11530924, + "balance_loss_mlp": 1.04060054, + "epoch": 0.5346760859762513, + "flos": 22417776444960.0, + "grad_norm": 1.6842311166553687, + "language_loss": 0.87764084, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90436542, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 2.8233845233917236 + }, + { + "auxiliary_loss_clip": 0.01440215, + "auxiliary_loss_mlp": 0.0123925, + "balance_loss_clip": 1.12409985, + "balance_loss_mlp": 1.03812003, + "epoch": 0.5347362092289193, + "flos": 22818112181280.0, + "grad_norm": 1.8692025363948768, + "language_loss": 0.72776812, + "learning_rate": 1.872149074536869e-06, + "loss": 0.75456274, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.7574214935302734 + }, + { + "auxiliary_loss_clip": 0.01439477, + "auxiliary_loss_mlp": 0.01227488, + "balance_loss_clip": 1.12063169, + "balance_loss_mlp": 1.02359271, + "epoch": 0.5347963324815872, + "flos": 23221595954880.0, + "grad_norm": 1.8635124104894387, + "language_loss": 0.74970943, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77637911, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.8130035400390625 + }, + { + "auxiliary_loss_clip": 0.01440442, + "auxiliary_loss_mlp": 0.01240672, + "balance_loss_clip": 1.1239109, + "balance_loss_mlp": 1.03753972, + "epoch": 0.5348564557342552, + "flos": 22603550394240.0, + "grad_norm": 1.782287340772965, + "language_loss": 0.76843166, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79524279, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 4.232854843139648 + }, + { + "auxiliary_loss_clip": 0.01441004, + "auxiliary_loss_mlp": 0.01231192, + "balance_loss_clip": 1.12299752, + "balance_loss_mlp": 1.02653396, + "epoch": 0.5349165789869232, + "flos": 18004222023840.0, + "grad_norm": 1.7331927229298578, + "language_loss": 0.78455758, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.81127954, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 4.29032826423645 + }, + { + "auxiliary_loss_clip": 0.01440848, + "auxiliary_loss_mlp": 0.0123455, + "balance_loss_clip": 1.12347889, + "balance_loss_mlp": 1.02912903, + "epoch": 0.5349767022395912, + "flos": 17161374072960.0, + "grad_norm": 2.1581415477205774, + "language_loss": 0.75919878, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.78595275, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.74761700630188 + }, + { + "auxiliary_loss_clip": 0.01507737, + "auxiliary_loss_mlp": 0.01193703, + "balance_loss_clip": 1.20037127, + "balance_loss_mlp": 0.99610138, + "epoch": 0.5350368254922592, + "flos": 70999606715040.0, + "grad_norm": 0.842256238828179, + "language_loss": 0.57947356, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60648799, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.5485568046569824 + }, + { + "auxiliary_loss_clip": 0.01438332, + "auxiliary_loss_mlp": 0.01233291, + "balance_loss_clip": 1.12140632, + "balance_loss_mlp": 1.02710652, + "epoch": 0.5350969487449271, + "flos": 27420323091840.0, + "grad_norm": 1.7365259261641441, + "language_loss": 0.70211071, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72882688, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.845449924468994 + }, + { + "auxiliary_loss_clip": 0.01439315, + "auxiliary_loss_mlp": 0.01238489, + "balance_loss_clip": 1.12283278, + "balance_loss_mlp": 1.03402114, + "epoch": 0.5351570719975951, + "flos": 19318066601760.0, + "grad_norm": 4.362529943261556, + "language_loss": 0.71573627, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.74251431, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.796525478363037 + }, + { + "auxiliary_loss_clip": 0.014388, + "auxiliary_loss_mlp": 0.0123777, + "balance_loss_clip": 1.12293649, + "balance_loss_mlp": 1.03292084, + "epoch": 0.535217195250263, + "flos": 19830632856480.0, + "grad_norm": 2.334023359565564, + "language_loss": 0.77277339, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79953915, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 4.320901393890381 + }, + { + "auxiliary_loss_clip": 0.01438562, + "auxiliary_loss_mlp": 0.01235719, + "balance_loss_clip": 1.12324548, + "balance_loss_mlp": 1.03334892, + "epoch": 0.535277318502931, + "flos": 22130088436800.0, + "grad_norm": 1.516664633497323, + "language_loss": 0.70108646, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72782928, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 2.8338608741760254 + }, + { + "auxiliary_loss_clip": 0.01434039, + "auxiliary_loss_mlp": 0.01250607, + "balance_loss_clip": 1.11771488, + "balance_loss_mlp": 1.05014491, + "epoch": 0.5353374417555989, + "flos": 25048234356480.0, + "grad_norm": 1.9877569001292736, + "language_loss": 0.72333372, + "learning_rate": 1.86826266833795e-06, + "loss": 0.75018013, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.8860647678375244 + }, + { + "auxiliary_loss_clip": 0.01437707, + "auxiliary_loss_mlp": 0.01252791, + "balance_loss_clip": 1.12192988, + "balance_loss_mlp": 1.04965854, + "epoch": 0.535397565008267, + "flos": 19390623900480.0, + "grad_norm": 3.428684269372512, + "language_loss": 0.73359072, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.76049566, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 2.7651586532592773 + }, + { + "auxiliary_loss_clip": 0.01433897, + "auxiliary_loss_mlp": 0.01232975, + "balance_loss_clip": 1.11910713, + "balance_loss_mlp": 1.03117716, + "epoch": 0.5354576882609349, + "flos": 21473393716800.0, + "grad_norm": 1.5299787698284582, + "language_loss": 0.8385129, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.86518162, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.7719295024871826 + }, + { + "auxiliary_loss_clip": 0.01431991, + "auxiliary_loss_mlp": 0.01238626, + "balance_loss_clip": 1.11555362, + "balance_loss_mlp": 1.03492165, + "epoch": 0.5355178115136029, + "flos": 20779642820160.0, + "grad_norm": 1.9141156182854344, + "language_loss": 0.74019766, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76690376, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.8129653930664062 + }, + { + "auxiliary_loss_clip": 0.0143599, + "auxiliary_loss_mlp": 0.01240988, + "balance_loss_clip": 1.1192894, + "balance_loss_mlp": 1.03728294, + "epoch": 0.5355779347662708, + "flos": 23516300672640.0, + "grad_norm": 2.0368162739662807, + "language_loss": 0.76352972, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79029948, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.9666061401367188 + }, + { + "auxiliary_loss_clip": 0.01441269, + "auxiliary_loss_mlp": 0.01250683, + "balance_loss_clip": 1.12388563, + "balance_loss_mlp": 1.04697847, + "epoch": 0.5356380580189388, + "flos": 20305308515040.0, + "grad_norm": 2.0826486142382734, + "language_loss": 0.74311554, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.77003509, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.846745729446411 + }, + { + "auxiliary_loss_clip": 0.01452607, + "auxiliary_loss_mlp": 0.0123289, + "balance_loss_clip": 1.13562858, + "balance_loss_mlp": 1.02899432, + "epoch": 0.5356981812716068, + "flos": 21363818169600.0, + "grad_norm": 2.2228488454893363, + "language_loss": 0.84117115, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86802614, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.7885220050811768 + }, + { + "auxiliary_loss_clip": 0.014469, + "auxiliary_loss_mlp": 0.01235257, + "balance_loss_clip": 1.1290431, + "balance_loss_mlp": 1.02926373, + "epoch": 0.5357583045242748, + "flos": 23113423749600.0, + "grad_norm": 1.8414618962688734, + "language_loss": 0.81861752, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.84543908, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.8263132572174072 + }, + { + "auxiliary_loss_clip": 0.01443979, + "auxiliary_loss_mlp": 0.012398, + "balance_loss_clip": 1.12796712, + "balance_loss_mlp": 1.03838348, + "epoch": 0.5358184277769428, + "flos": 21143870583840.0, + "grad_norm": 1.8781636835945181, + "language_loss": 0.6914351, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71827281, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.8297982215881348 + }, + { + "auxiliary_loss_clip": 0.01449607, + "auxiliary_loss_mlp": 0.01240967, + "balance_loss_clip": 1.1326046, + "balance_loss_mlp": 1.03802562, + "epoch": 0.5358785510296107, + "flos": 16283859419520.0, + "grad_norm": 1.8661730812926633, + "language_loss": 0.7162683, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.74317408, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 2.761591672897339 + }, + { + "auxiliary_loss_clip": 0.01445787, + "auxiliary_loss_mlp": 0.01243481, + "balance_loss_clip": 1.12683213, + "balance_loss_mlp": 1.04034853, + "epoch": 0.5359386742822787, + "flos": 16978444735680.0, + "grad_norm": 1.6691586326344625, + "language_loss": 0.71949601, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74638867, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.769915819168091 + }, + { + "auxiliary_loss_clip": 0.01445882, + "auxiliary_loss_mlp": 0.01247831, + "balance_loss_clip": 1.13012588, + "balance_loss_mlp": 1.0410744, + "epoch": 0.5359987975349466, + "flos": 20814992229600.0, + "grad_norm": 2.229021115383745, + "language_loss": 0.70687407, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.73381126, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 2.810418128967285 + }, + { + "auxiliary_loss_clip": 0.01443604, + "auxiliary_loss_mlp": 0.01239625, + "balance_loss_clip": 1.1269244, + "balance_loss_mlp": 1.03725553, + "epoch": 0.5360589207876146, + "flos": 22202114741280.0, + "grad_norm": 2.111387339864719, + "language_loss": 0.7494399, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77627218, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 2.81350040435791 + }, + { + "auxiliary_loss_clip": 0.014365, + "auxiliary_loss_mlp": 0.01250645, + "balance_loss_clip": 1.11924696, + "balance_loss_mlp": 1.04884779, + "epoch": 0.5361190440402825, + "flos": 31397054522400.0, + "grad_norm": 2.7230202889987427, + "language_loss": 0.72257954, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74945098, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 2.8669204711914062 + }, + { + "auxiliary_loss_clip": 0.01445414, + "auxiliary_loss_mlp": 0.01237641, + "balance_loss_clip": 1.12918985, + "balance_loss_mlp": 1.03355443, + "epoch": 0.5361791672929506, + "flos": 16071687106560.0, + "grad_norm": 2.335656773266709, + "language_loss": 0.71913743, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.74596798, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 2.763782262802124 + }, + { + "auxiliary_loss_clip": 0.01443322, + "auxiliary_loss_mlp": 0.01242764, + "balance_loss_clip": 1.12799978, + "balance_loss_mlp": 1.03753376, + "epoch": 0.5362392905456185, + "flos": 20743041781440.0, + "grad_norm": 4.056381276085041, + "language_loss": 0.75230819, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77916902, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 2.7929940223693848 + }, + { + "auxiliary_loss_clip": 0.01441571, + "auxiliary_loss_mlp": 0.01236605, + "balance_loss_clip": 1.1245718, + "balance_loss_mlp": 1.03347278, + "epoch": 0.5362994137982865, + "flos": 17341572582720.0, + "grad_norm": 1.980433850383376, + "language_loss": 0.7175948, + "learning_rate": 1.862045463611864e-06, + "loss": 0.74437654, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 2.807321071624756 + }, + { + "auxiliary_loss_clip": 0.01446119, + "auxiliary_loss_mlp": 0.01231258, + "balance_loss_clip": 1.13072371, + "balance_loss_mlp": 1.0288887, + "epoch": 0.5363595370509544, + "flos": 42817261674240.0, + "grad_norm": 1.505448320849516, + "language_loss": 0.68653655, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.7133103, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 2.9719245433807373 + }, + { + "auxiliary_loss_clip": 0.01445833, + "auxiliary_loss_mlp": 0.01240057, + "balance_loss_clip": 1.13056612, + "balance_loss_mlp": 1.03501666, + "epoch": 0.5364196603036224, + "flos": 19173976064640.0, + "grad_norm": 1.8356399400868286, + "language_loss": 0.81672156, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84358048, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.7190847396850586 + }, + { + "auxiliary_loss_clip": 0.01439052, + "auxiliary_loss_mlp": 0.01232782, + "balance_loss_clip": 1.12314916, + "balance_loss_mlp": 1.02716947, + "epoch": 0.5364797835562904, + "flos": 17932992210720.0, + "grad_norm": 2.2802764617851365, + "language_loss": 0.7691555, + "learning_rate": 1.860879884996686e-06, + "loss": 0.79587376, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 2.769582509994507 + }, + { + "auxiliary_loss_clip": 0.0144076, + "auxiliary_loss_mlp": 0.01248603, + "balance_loss_clip": 1.12530434, + "balance_loss_mlp": 1.04318142, + "epoch": 0.5365399068089584, + "flos": 30230751944160.0, + "grad_norm": 1.4260053882014603, + "language_loss": 0.70554829, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.7324419, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 2.8521804809570312 + }, + { + "auxiliary_loss_clip": 0.01444239, + "auxiliary_loss_mlp": 0.01248177, + "balance_loss_clip": 1.12674499, + "balance_loss_mlp": 1.04180217, + "epoch": 0.5366000300616264, + "flos": 24893561862720.0, + "grad_norm": 2.0313141060738826, + "language_loss": 0.86863351, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89555764, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.83941388130188 + }, + { + "auxiliary_loss_clip": 0.01432261, + "auxiliary_loss_mlp": 0.01239008, + "balance_loss_clip": 1.11574054, + "balance_loss_mlp": 1.03644753, + "epoch": 0.5366601533142943, + "flos": 29829316291200.0, + "grad_norm": 1.6190808061013702, + "language_loss": 0.78277791, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80949056, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 2.8537914752960205 + }, + { + "auxiliary_loss_clip": 0.01448262, + "auxiliary_loss_mlp": 0.01236374, + "balance_loss_clip": 1.13101745, + "balance_loss_mlp": 1.02961731, + "epoch": 0.5367202765669623, + "flos": 27201854704320.0, + "grad_norm": 2.053454071786395, + "language_loss": 0.67235446, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69920081, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 2.8387176990509033 + }, + { + "auxiliary_loss_clip": 0.01440182, + "auxiliary_loss_mlp": 0.01225552, + "balance_loss_clip": 1.12210643, + "balance_loss_mlp": 1.0214653, + "epoch": 0.5367803998196302, + "flos": 20232030581280.0, + "grad_norm": 2.302885019914878, + "language_loss": 0.73573434, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.76239163, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 4.013944149017334 + }, + { + "auxiliary_loss_clip": 0.01441508, + "auxiliary_loss_mlp": 0.01235443, + "balance_loss_clip": 1.12505913, + "balance_loss_mlp": 1.03288293, + "epoch": 0.5368405230722982, + "flos": 32157028715040.0, + "grad_norm": 1.8977049172079503, + "language_loss": 0.62964445, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65641391, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.8880128860473633 + }, + { + "auxiliary_loss_clip": 0.01442783, + "auxiliary_loss_mlp": 0.01235827, + "balance_loss_clip": 1.12413895, + "balance_loss_mlp": 1.03193128, + "epoch": 0.5369006463249661, + "flos": 26250227697600.0, + "grad_norm": 3.7837092070981875, + "language_loss": 0.66035622, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68714225, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.8393609523773193 + }, + { + "auxiliary_loss_clip": 0.01443492, + "auxiliary_loss_mlp": 0.01227449, + "balance_loss_clip": 1.12497854, + "balance_loss_mlp": 1.02488863, + "epoch": 0.5369607695776342, + "flos": 26213588730720.0, + "grad_norm": 1.6638825730155138, + "language_loss": 0.66604298, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69275236, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 2.9241082668304443 + }, + { + "auxiliary_loss_clip": 0.01444897, + "auxiliary_loss_mlp": 0.01223725, + "balance_loss_clip": 1.12662613, + "balance_loss_mlp": 1.01754069, + "epoch": 0.5370208928303021, + "flos": 25011329892480.0, + "grad_norm": 1.66402326823427, + "language_loss": 0.75552225, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.78220856, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 2.7832396030426025 + }, + { + "auxiliary_loss_clip": 0.01446707, + "auxiliary_loss_mlp": 0.01233438, + "balance_loss_clip": 1.1274308, + "balance_loss_mlp": 1.02954221, + "epoch": 0.5370810160829701, + "flos": 31794659431200.0, + "grad_norm": 1.8294633995645297, + "language_loss": 0.65885621, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68565768, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.840285301208496 + }, + { + "auxiliary_loss_clip": 0.01446151, + "auxiliary_loss_mlp": 0.01234031, + "balance_loss_clip": 1.12789619, + "balance_loss_mlp": 1.03051686, + "epoch": 0.537141139335638, + "flos": 23844913529760.0, + "grad_norm": 1.7870793880784879, + "language_loss": 0.83014369, + "learning_rate": 1.856606505975565e-06, + "loss": 0.85694551, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 4.2610344886779785 + }, + { + "auxiliary_loss_clip": 0.01442468, + "auxiliary_loss_mlp": 0.01225318, + "balance_loss_clip": 1.12391722, + "balance_loss_mlp": 1.01951528, + "epoch": 0.537201262588306, + "flos": 18510454275840.0, + "grad_norm": 1.9961128844602014, + "language_loss": 0.7986142, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82529211, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 4.245995998382568 + }, + { + "auxiliary_loss_clip": 0.0144432, + "auxiliary_loss_mlp": 0.01228643, + "balance_loss_clip": 1.1250217, + "balance_loss_mlp": 1.02245903, + "epoch": 0.537261385840974, + "flos": 25665142072320.0, + "grad_norm": 6.340351704184592, + "language_loss": 0.83593106, + "learning_rate": 1.855829598084659e-06, + "loss": 0.86266071, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.841034173965454 + }, + { + "auxiliary_loss_clip": 0.01452989, + "auxiliary_loss_mlp": 0.01233894, + "balance_loss_clip": 1.13363421, + "balance_loss_mlp": 1.03057075, + "epoch": 0.537321509093642, + "flos": 40738246745760.0, + "grad_norm": 1.2525919762253135, + "language_loss": 0.72753048, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.7543993, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.9418070316314697 + }, + { + "auxiliary_loss_clip": 0.01442063, + "auxiliary_loss_mlp": 0.01231476, + "balance_loss_clip": 1.12305427, + "balance_loss_mlp": 1.02529192, + "epoch": 0.53738163234631, + "flos": 17240417087040.0, + "grad_norm": 2.805894368028558, + "language_loss": 0.80938876, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83612412, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.7726926803588867 + }, + { + "auxiliary_loss_clip": 0.01444592, + "auxiliary_loss_mlp": 0.01234202, + "balance_loss_clip": 1.12619698, + "balance_loss_mlp": 1.02935314, + "epoch": 0.5374417555989779, + "flos": 12823562916000.0, + "grad_norm": 2.7739614493023352, + "language_loss": 0.80792129, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.83470929, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.7825417518615723 + }, + { + "auxiliary_loss_clip": 0.01539998, + "auxiliary_loss_mlp": 0.01189903, + "balance_loss_clip": 1.22835088, + "balance_loss_mlp": 0.99382782, + "epoch": 0.5375018788516459, + "flos": 67262545877760.0, + "grad_norm": 0.6964851992578656, + "language_loss": 0.52421165, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.55151069, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 4.7826032638549805 + }, + { + "auxiliary_loss_clip": 0.01442207, + "auxiliary_loss_mlp": 0.01235262, + "balance_loss_clip": 1.12316954, + "balance_loss_mlp": 1.03098536, + "epoch": 0.5375620021043138, + "flos": 18116149116960.0, + "grad_norm": 2.3848712180539335, + "language_loss": 0.72062778, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.74740249, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.800321578979492 + }, + { + "auxiliary_loss_clip": 0.01444654, + "auxiliary_loss_mlp": 0.01233256, + "balance_loss_clip": 1.12797356, + "balance_loss_mlp": 1.03184021, + "epoch": 0.5376221253569818, + "flos": 23151655699200.0, + "grad_norm": 2.2117428271577992, + "language_loss": 0.79596698, + "learning_rate": 1.853499006090237e-06, + "loss": 0.8227461, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.782045841217041 + }, + { + "auxiliary_loss_clip": 0.01444892, + "auxiliary_loss_mlp": 0.01249871, + "balance_loss_clip": 1.12873125, + "balance_loss_mlp": 1.0472157, + "epoch": 0.5376822486096497, + "flos": 29974961882880.0, + "grad_norm": 1.6920192526378601, + "language_loss": 0.69970047, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72664809, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 2.861514091491699 + }, + { + "auxiliary_loss_clip": 0.01546507, + "auxiliary_loss_mlp": 0.01194305, + "balance_loss_clip": 1.23647463, + "balance_loss_mlp": 0.99746704, + "epoch": 0.5377423718623178, + "flos": 54175379335200.0, + "grad_norm": 0.8135874850414495, + "language_loss": 0.5961836, + "learning_rate": 1.852722186377645e-06, + "loss": 0.62359172, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.3137967586517334 + }, + { + "auxiliary_loss_clip": 0.01438685, + "auxiliary_loss_mlp": 0.01236631, + "balance_loss_clip": 1.12144828, + "balance_loss_mlp": 1.03178215, + "epoch": 0.5378024951149857, + "flos": 23259258982080.0, + "grad_norm": 6.0030903524140085, + "language_loss": 0.77372843, + "learning_rate": 1.852333784891169e-06, + "loss": 0.80048156, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 2.8266165256500244 + }, + { + "auxiliary_loss_clip": 0.01439274, + "auxiliary_loss_mlp": 0.01233701, + "balance_loss_clip": 1.12045276, + "balance_loss_mlp": 1.0326668, + "epoch": 0.5378626183676537, + "flos": 24026363668800.0, + "grad_norm": 3.1927061592805823, + "language_loss": 0.69172609, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.71845579, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.834430456161499 + }, + { + "auxiliary_loss_clip": 0.01447104, + "auxiliary_loss_mlp": 0.01235297, + "balance_loss_clip": 1.12808406, + "balance_loss_mlp": 1.03349996, + "epoch": 0.5379227416203216, + "flos": 27164229605280.0, + "grad_norm": 1.8706642122662454, + "language_loss": 0.76823765, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79506165, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.845120429992676 + }, + { + "auxiliary_loss_clip": 0.01441042, + "auxiliary_loss_mlp": 0.01240279, + "balance_loss_clip": 1.12335682, + "balance_loss_mlp": 1.0365746, + "epoch": 0.5379828648729896, + "flos": 24683968664640.0, + "grad_norm": 1.7309771356094403, + "language_loss": 0.60062754, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62744081, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.8279898166656494 + }, + { + "auxiliary_loss_clip": 0.01446616, + "auxiliary_loss_mlp": 0.01237965, + "balance_loss_clip": 1.12890351, + "balance_loss_mlp": 1.03216207, + "epoch": 0.5380429881256577, + "flos": 22525190087040.0, + "grad_norm": 1.762376292359276, + "language_loss": 0.79328418, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.82012999, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.831528902053833 + }, + { + "auxiliary_loss_clip": 0.01440969, + "auxiliary_loss_mlp": 0.01240825, + "balance_loss_clip": 1.12262225, + "balance_loss_mlp": 1.03883672, + "epoch": 0.5381031113783256, + "flos": 26981982974880.0, + "grad_norm": 1.788506018829135, + "language_loss": 0.78224361, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80906153, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.831131935119629 + }, + { + "auxiliary_loss_clip": 0.01448532, + "auxiliary_loss_mlp": 0.01239238, + "balance_loss_clip": 1.13260508, + "balance_loss_mlp": 1.03648722, + "epoch": 0.5381632346309936, + "flos": 24756412178880.0, + "grad_norm": 2.3519642672293393, + "language_loss": 0.73067534, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.75755298, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.819733142852783 + }, + { + "auxiliary_loss_clip": 0.01438687, + "auxiliary_loss_mlp": 0.0124793, + "balance_loss_clip": 1.12265182, + "balance_loss_mlp": 1.04594159, + "epoch": 0.5382233578836615, + "flos": 15561889607520.0, + "grad_norm": 1.8560647655217477, + "language_loss": 0.75111437, + "learning_rate": 1.849615132097085e-06, + "loss": 0.77798057, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.754542112350464 + }, + { + "auxiliary_loss_clip": 0.01442704, + "auxiliary_loss_mlp": 0.01241484, + "balance_loss_clip": 1.12497759, + "balance_loss_mlp": 1.0373981, + "epoch": 0.5382834811363295, + "flos": 25086959372160.0, + "grad_norm": 1.5727547198764535, + "language_loss": 0.79446417, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.82130605, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.8700296878814697 + }, + { + "auxiliary_loss_clip": 0.01441341, + "auxiliary_loss_mlp": 0.01231346, + "balance_loss_clip": 1.12615025, + "balance_loss_mlp": 1.02935791, + "epoch": 0.5383436043889974, + "flos": 13299603988320.0, + "grad_norm": 1.9230934692017623, + "language_loss": 0.80393958, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.83066642, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.800318956375122 + }, + { + "auxiliary_loss_clip": 0.01445144, + "auxiliary_loss_mlp": 0.01235648, + "balance_loss_clip": 1.12934566, + "balance_loss_mlp": 1.03213429, + "epoch": 0.5384037276416654, + "flos": 23041625014080.0, + "grad_norm": 2.4616518640969423, + "language_loss": 0.76514721, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.79195517, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 2.7802724838256836 + }, + { + "auxiliary_loss_clip": 0.01442225, + "auxiliary_loss_mlp": 0.01231481, + "balance_loss_clip": 1.12583709, + "balance_loss_mlp": 1.02701306, + "epoch": 0.5384638508943334, + "flos": 20633238665280.0, + "grad_norm": 1.6469519682895495, + "language_loss": 0.78431857, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.81105566, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 2.820953607559204 + }, + { + "auxiliary_loss_clip": 0.01511778, + "auxiliary_loss_mlp": 0.01239601, + "balance_loss_clip": 1.20785582, + "balance_loss_mlp": 1.04505157, + "epoch": 0.5385239741470014, + "flos": 66743759404800.0, + "grad_norm": 0.8502351407845139, + "language_loss": 0.63367373, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.66118753, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 3.251481294631958 + }, + { + "auxiliary_loss_clip": 0.0150854, + "auxiliary_loss_mlp": 0.01232956, + "balance_loss_clip": 1.20425379, + "balance_loss_mlp": 1.03955078, + "epoch": 0.5385840973996693, + "flos": 64723230063360.0, + "grad_norm": 0.7133385216747912, + "language_loss": 0.51520753, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.54262251, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 3.300173044204712 + }, + { + "auxiliary_loss_clip": 0.01449397, + "auxiliary_loss_mlp": 0.01244572, + "balance_loss_clip": 1.1331315, + "balance_loss_mlp": 1.03972244, + "epoch": 0.5386442206523373, + "flos": 26144255325600.0, + "grad_norm": 1.6294936348088387, + "language_loss": 0.77144849, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79838818, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 2.8918557167053223 + }, + { + "auxiliary_loss_clip": 0.01439906, + "auxiliary_loss_mlp": 0.01241244, + "balance_loss_clip": 1.12396741, + "balance_loss_mlp": 1.03811193, + "epoch": 0.5387043439050052, + "flos": 18253374657120.0, + "grad_norm": 3.177657625966365, + "language_loss": 0.83259952, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85941106, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 2.8118648529052734 + }, + { + "auxiliary_loss_clip": 0.01439635, + "auxiliary_loss_mlp": 0.01241918, + "balance_loss_clip": 1.12391424, + "balance_loss_mlp": 1.03745055, + "epoch": 0.5387644671576732, + "flos": 29790856772640.0, + "grad_norm": 1.5570015724639528, + "language_loss": 0.78483355, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.81164908, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 2.881422281265259 + }, + { + "auxiliary_loss_clip": 0.01447392, + "auxiliary_loss_mlp": 0.01243933, + "balance_loss_clip": 1.13172197, + "balance_loss_mlp": 1.04060936, + "epoch": 0.5388245904103413, + "flos": 22376206817280.0, + "grad_norm": 1.7212702877538892, + "language_loss": 0.84353489, + "learning_rate": 1.845731828364681e-06, + "loss": 0.87044811, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 2.798707962036133 + }, + { + "auxiliary_loss_clip": 0.01523899, + "auxiliary_loss_mlp": 0.01206383, + "balance_loss_clip": 1.21644497, + "balance_loss_mlp": 1.01030731, + "epoch": 0.5388847136630092, + "flos": 69814340056800.0, + "grad_norm": 0.732450676612426, + "language_loss": 0.54086518, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56816804, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 3.2926743030548096 + }, + { + "auxiliary_loss_clip": 0.01517399, + "auxiliary_loss_mlp": 0.01212715, + "balance_loss_clip": 1.21200061, + "balance_loss_mlp": 1.01587677, + "epoch": 0.5389448369156772, + "flos": 69829663033440.0, + "grad_norm": 0.7969617711562414, + "language_loss": 0.63209486, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65939599, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 3.3580243587493896 + }, + { + "auxiliary_loss_clip": 0.01439737, + "auxiliary_loss_mlp": 0.01227879, + "balance_loss_clip": 1.1229794, + "balance_loss_mlp": 1.02398384, + "epoch": 0.5390049601683451, + "flos": 31725136385280.0, + "grad_norm": 1.7773465355090918, + "language_loss": 0.69968855, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72636473, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 2.8266255855560303 + }, + { + "auxiliary_loss_clip": 0.01443939, + "auxiliary_loss_mlp": 0.01234798, + "balance_loss_clip": 1.12915063, + "balance_loss_mlp": 1.03013992, + "epoch": 0.5390650834210131, + "flos": 18115428481920.0, + "grad_norm": 2.6881071419749514, + "language_loss": 0.8249259, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.85171324, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 4.146892547607422 + }, + { + "auxiliary_loss_clip": 0.01450344, + "auxiliary_loss_mlp": 0.01234656, + "balance_loss_clip": 1.13598001, + "balance_loss_mlp": 1.02751815, + "epoch": 0.539125206673681, + "flos": 17418415763520.0, + "grad_norm": 2.2750867797462875, + "language_loss": 0.72866744, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.75551742, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 2.794787645339966 + }, + { + "auxiliary_loss_clip": 0.01440149, + "auxiliary_loss_mlp": 0.01229711, + "balance_loss_clip": 1.12598062, + "balance_loss_mlp": 1.02753186, + "epoch": 0.539185329926349, + "flos": 22201052752800.0, + "grad_norm": 1.6470630344593065, + "language_loss": 0.81784999, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.84454858, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.7642464637756348 + }, + { + "auxiliary_loss_clip": 0.01447603, + "auxiliary_loss_mlp": 0.01225874, + "balance_loss_clip": 1.13304341, + "balance_loss_mlp": 1.01892626, + "epoch": 0.539245453179017, + "flos": 21436565109120.0, + "grad_norm": 1.6574617894867356, + "language_loss": 0.73805904, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76479387, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 2.7833378314971924 + }, + { + "auxiliary_loss_clip": 0.01443398, + "auxiliary_loss_mlp": 0.01232433, + "balance_loss_clip": 1.12950444, + "balance_loss_mlp": 1.02853775, + "epoch": 0.539305576431685, + "flos": 20736783635040.0, + "grad_norm": 4.868354634039967, + "language_loss": 0.81993318, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84669155, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.7557735443115234 + }, + { + "auxiliary_loss_clip": 0.01448764, + "auxiliary_loss_mlp": 0.0122759, + "balance_loss_clip": 1.13623428, + "balance_loss_mlp": 1.02350354, + "epoch": 0.5393656996843529, + "flos": 30923137427040.0, + "grad_norm": 1.568517020805527, + "language_loss": 0.75274318, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77950674, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 2.892085313796997 + }, + { + "auxiliary_loss_clip": 0.01504579, + "auxiliary_loss_mlp": 0.01196701, + "balance_loss_clip": 1.20299518, + "balance_loss_mlp": 1.00062561, + "epoch": 0.5394258229370209, + "flos": 50322864085920.0, + "grad_norm": 1.019221937697886, + "language_loss": 0.60352415, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.63053697, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 3.36063551902771 + }, + { + "auxiliary_loss_clip": 0.0144893, + "auxiliary_loss_mlp": 0.01227244, + "balance_loss_clip": 1.13471293, + "balance_loss_mlp": 1.0229671, + "epoch": 0.5394859461896888, + "flos": 25414358528160.0, + "grad_norm": 1.57398067241872, + "language_loss": 0.7835446, + "learning_rate": 1.841460870485045e-06, + "loss": 0.81030631, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 5.9478161334991455 + }, + { + "auxiliary_loss_clip": 0.01451861, + "auxiliary_loss_mlp": 0.01237361, + "balance_loss_clip": 1.13693595, + "balance_loss_mlp": 1.03041327, + "epoch": 0.5395460694423568, + "flos": 25480543896000.0, + "grad_norm": 3.121683920691884, + "language_loss": 0.73975277, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76664501, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 2.8280141353607178 + }, + { + "auxiliary_loss_clip": 0.01502654, + "auxiliary_loss_mlp": 0.01203934, + "balance_loss_clip": 1.20090771, + "balance_loss_mlp": 1.00785828, + "epoch": 0.5396061926950249, + "flos": 53255460634560.0, + "grad_norm": 0.7435661059489317, + "language_loss": 0.50991398, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53697985, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.2572414875030518 + }, + { + "auxiliary_loss_clip": 0.01452677, + "auxiliary_loss_mlp": 0.01228454, + "balance_loss_clip": 1.13889623, + "balance_loss_mlp": 1.0239861, + "epoch": 0.5396663159476928, + "flos": 26727558327360.0, + "grad_norm": 1.5430249464214056, + "language_loss": 0.72504199, + "learning_rate": 1.840296189214344e-06, + "loss": 0.75185329, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.8690688610076904 + }, + { + "auxiliary_loss_clip": 0.01452491, + "auxiliary_loss_mlp": 0.01235949, + "balance_loss_clip": 1.13836527, + "balance_loss_mlp": 1.03376961, + "epoch": 0.5397264392003608, + "flos": 23255352381600.0, + "grad_norm": 1.9639528228839171, + "language_loss": 0.70343143, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.7303158, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.7885501384735107 + }, + { + "auxiliary_loss_clip": 0.0144653, + "auxiliary_loss_mlp": 0.01232003, + "balance_loss_clip": 1.13460183, + "balance_loss_mlp": 1.02810717, + "epoch": 0.5397865624530287, + "flos": 18296044201440.0, + "grad_norm": 3.778664153663005, + "language_loss": 0.72673005, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.75351536, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 4.210653781890869 + }, + { + "auxiliary_loss_clip": 0.0144454, + "auxiliary_loss_mlp": 0.01247742, + "balance_loss_clip": 1.13106179, + "balance_loss_mlp": 1.04117644, + "epoch": 0.5398466857056967, + "flos": 15298931124000.0, + "grad_norm": 2.100639783216034, + "language_loss": 0.74343264, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.77035552, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.7853825092315674 + }, + { + "auxiliary_loss_clip": 0.01447526, + "auxiliary_loss_mlp": 0.0123303, + "balance_loss_clip": 1.13268673, + "balance_loss_mlp": 1.0266552, + "epoch": 0.5399068089583646, + "flos": 17823682160640.0, + "grad_norm": 2.2369254025454914, + "language_loss": 0.77685976, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.80366534, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.7986340522766113 + }, + { + "auxiliary_loss_clip": 0.01444532, + "auxiliary_loss_mlp": 0.01234118, + "balance_loss_clip": 1.13019109, + "balance_loss_mlp": 1.03117597, + "epoch": 0.5399669322110326, + "flos": 27384139262880.0, + "grad_norm": 1.8442655259733847, + "language_loss": 0.82373965, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.85052615, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.8888585567474365 + }, + { + "auxiliary_loss_clip": 0.01446555, + "auxiliary_loss_mlp": 0.01235753, + "balance_loss_clip": 1.13168716, + "balance_loss_mlp": 1.03052199, + "epoch": 0.5400270554637006, + "flos": 20451219603840.0, + "grad_norm": 2.3523590456716765, + "language_loss": 0.67052913, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69735217, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.832766056060791 + }, + { + "auxiliary_loss_clip": 0.01443075, + "auxiliary_loss_mlp": 0.01235391, + "balance_loss_clip": 1.12791538, + "balance_loss_mlp": 1.03187704, + "epoch": 0.5400871787163686, + "flos": 21691748319840.0, + "grad_norm": 1.7429689488541795, + "language_loss": 0.82679719, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.85358185, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 2.862800359725952 + }, + { + "auxiliary_loss_clip": 0.01444877, + "auxiliary_loss_mlp": 0.01226324, + "balance_loss_clip": 1.12895727, + "balance_loss_mlp": 1.02014005, + "epoch": 0.5401473019690365, + "flos": 19206556718400.0, + "grad_norm": 2.0642406406156075, + "language_loss": 0.70651197, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.73322403, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 2.843353748321533 + }, + { + "auxiliary_loss_clip": 0.01445773, + "auxiliary_loss_mlp": 0.01250157, + "balance_loss_clip": 1.1304822, + "balance_loss_mlp": 1.04597545, + "epoch": 0.5402074252217045, + "flos": 20629256208480.0, + "grad_norm": 1.7413060039930324, + "language_loss": 0.79777849, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82473779, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.8439536094665527 + }, + { + "auxiliary_loss_clip": 0.01443837, + "auxiliary_loss_mlp": 0.01234888, + "balance_loss_clip": 1.12826025, + "balance_loss_mlp": 1.03499842, + "epoch": 0.5402675484743724, + "flos": 24975828770400.0, + "grad_norm": 1.5112537498656955, + "language_loss": 0.79006422, + "learning_rate": 1.83641431418363e-06, + "loss": 0.8168515, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.8232598304748535 + }, + { + "auxiliary_loss_clip": 0.01441298, + "auxiliary_loss_mlp": 0.01238658, + "balance_loss_clip": 1.12520242, + "balance_loss_mlp": 1.03628802, + "epoch": 0.5403276717270404, + "flos": 19460791725120.0, + "grad_norm": 1.9262742981670211, + "language_loss": 0.77323008, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.80002964, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.790708541870117 + }, + { + "auxiliary_loss_clip": 0.01444833, + "auxiliary_loss_mlp": 0.01236217, + "balance_loss_clip": 1.12933993, + "balance_loss_mlp": 1.03365612, + "epoch": 0.5403877949797083, + "flos": 18444117195360.0, + "grad_norm": 1.7511967730527798, + "language_loss": 0.71461737, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.7414279, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.7738711833953857 + }, + { + "auxiliary_loss_clip": 0.01442246, + "auxiliary_loss_mlp": 0.01242858, + "balance_loss_clip": 1.12451649, + "balance_loss_mlp": 1.03953445, + "epoch": 0.5404479182323764, + "flos": 28295372414880.0, + "grad_norm": 2.5264076223867393, + "language_loss": 0.67193425, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69878531, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.8164358139038086 + }, + { + "auxiliary_loss_clip": 0.01443812, + "auxiliary_loss_mlp": 0.01241762, + "balance_loss_clip": 1.12662208, + "balance_loss_mlp": 1.03824806, + "epoch": 0.5405080414850444, + "flos": 23369555164320.0, + "grad_norm": 1.6659924213974566, + "language_loss": 0.77870309, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.8055588, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.896617889404297 + }, + { + "auxiliary_loss_clip": 0.01443309, + "auxiliary_loss_mlp": 0.01228989, + "balance_loss_clip": 1.12538624, + "balance_loss_mlp": 1.02776408, + "epoch": 0.5405681647377123, + "flos": 21108597030720.0, + "grad_norm": 1.8904406779227054, + "language_loss": 0.69178355, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71850646, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.799208879470825 + }, + { + "auxiliary_loss_clip": 0.01450203, + "auxiliary_loss_mlp": 0.01229635, + "balance_loss_clip": 1.13125682, + "balance_loss_mlp": 1.02573907, + "epoch": 0.5406282879903803, + "flos": 20451295460160.0, + "grad_norm": 1.8714376745513366, + "language_loss": 0.76294369, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78974211, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 2.848773717880249 + }, + { + "auxiliary_loss_clip": 0.01441603, + "auxiliary_loss_mlp": 0.01239263, + "balance_loss_clip": 1.12370074, + "balance_loss_mlp": 1.03517687, + "epoch": 0.5406884112430482, + "flos": 14211216421920.0, + "grad_norm": 2.595707800706976, + "language_loss": 0.76068997, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78749859, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.77439546585083 + }, + { + "auxiliary_loss_clip": 0.01437702, + "auxiliary_loss_mlp": 0.01229658, + "balance_loss_clip": 1.12121236, + "balance_loss_mlp": 1.02938616, + "epoch": 0.5407485344957162, + "flos": 23877683824320.0, + "grad_norm": 1.634837350462455, + "language_loss": 0.7071197, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.73379332, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 2.866386651992798 + }, + { + "auxiliary_loss_clip": 0.01442902, + "auxiliary_loss_mlp": 0.01230378, + "balance_loss_clip": 1.12710345, + "balance_loss_mlp": 1.02438426, + "epoch": 0.5408086577483842, + "flos": 23150707495200.0, + "grad_norm": 2.1522536272943205, + "language_loss": 0.7534163, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.78014904, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 2.867033004760742 + }, + { + "auxiliary_loss_clip": 0.01438225, + "auxiliary_loss_mlp": 0.0122938, + "balance_loss_clip": 1.12195277, + "balance_loss_mlp": 1.02662849, + "epoch": 0.5408687810010522, + "flos": 18773071405920.0, + "grad_norm": 1.917972788420311, + "language_loss": 0.73237264, + "learning_rate": 1.832533059471282e-06, + "loss": 0.7590487, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 2.8224010467529297 + }, + { + "auxiliary_loss_clip": 0.01443783, + "auxiliary_loss_mlp": 0.01230707, + "balance_loss_clip": 1.12755203, + "balance_loss_mlp": 1.02814662, + "epoch": 0.5409289042537201, + "flos": 13883286271680.0, + "grad_norm": 4.361663557162213, + "language_loss": 0.73470765, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.76145256, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 2.837740421295166 + }, + { + "auxiliary_loss_clip": 0.01440705, + "auxiliary_loss_mlp": 0.01238166, + "balance_loss_clip": 1.12556112, + "balance_loss_mlp": 1.03570056, + "epoch": 0.5409890275063881, + "flos": 14467120267680.0, + "grad_norm": 2.0550301370605495, + "language_loss": 0.71940744, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74619615, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 2.786332845687866 + }, + { + "auxiliary_loss_clip": 0.01440038, + "auxiliary_loss_mlp": 0.01244012, + "balance_loss_clip": 1.12433505, + "balance_loss_mlp": 1.04412234, + "epoch": 0.541049150759056, + "flos": 48980952669600.0, + "grad_norm": 2.0759187176448575, + "language_loss": 0.70763534, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.73447585, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 3.0819363594055176 + }, + { + "auxiliary_loss_clip": 0.01445136, + "auxiliary_loss_mlp": 0.0123504, + "balance_loss_clip": 1.12825382, + "balance_loss_mlp": 1.03057253, + "epoch": 0.541109274011724, + "flos": 18149412477600.0, + "grad_norm": 2.386583084777513, + "language_loss": 0.80627382, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.83307552, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 2.7742421627044678 + }, + { + "auxiliary_loss_clip": 0.014415, + "auxiliary_loss_mlp": 0.01230999, + "balance_loss_clip": 1.12550867, + "balance_loss_mlp": 1.02977347, + "epoch": 0.541169397264392, + "flos": 20524687178400.0, + "grad_norm": 2.220778959666949, + "language_loss": 0.72786701, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75459194, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 2.8410305976867676 + }, + { + "auxiliary_loss_clip": 0.01444043, + "auxiliary_loss_mlp": 0.01237317, + "balance_loss_clip": 1.12804985, + "balance_loss_mlp": 1.03532863, + "epoch": 0.54122952051706, + "flos": 20045915278560.0, + "grad_norm": 2.926784329878882, + "language_loss": 0.85380757, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.88062108, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 2.749269962310791 + }, + { + "auxiliary_loss_clip": 0.01447926, + "auxiliary_loss_mlp": 0.012326, + "balance_loss_clip": 1.13422179, + "balance_loss_mlp": 1.02946782, + "epoch": 0.541289643769728, + "flos": 19064248804800.0, + "grad_norm": 1.7630630532753317, + "language_loss": 0.77730751, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80411285, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.822307825088501 + }, + { + "auxiliary_loss_clip": 0.01446063, + "auxiliary_loss_mlp": 0.01240684, + "balance_loss_clip": 1.13158238, + "balance_loss_mlp": 1.03802872, + "epoch": 0.5413497670223959, + "flos": 22384475156160.0, + "grad_norm": 2.5020418899965904, + "language_loss": 0.69730008, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.72416747, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 4.219947576522827 + }, + { + "auxiliary_loss_clip": 0.01496706, + "auxiliary_loss_mlp": 0.01186615, + "balance_loss_clip": 1.19546473, + "balance_loss_mlp": 0.99092102, + "epoch": 0.5414098902750639, + "flos": 70038573524640.0, + "grad_norm": 0.985438292999904, + "language_loss": 0.59019518, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61702847, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 3.5226731300354004 + }, + { + "auxiliary_loss_clip": 0.01441819, + "auxiliary_loss_mlp": 0.01239306, + "balance_loss_clip": 1.12731528, + "balance_loss_mlp": 1.03693581, + "epoch": 0.5414700135277318, + "flos": 21801134226240.0, + "grad_norm": 1.7226299070446816, + "language_loss": 0.77837396, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.8051852, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 2.7832255363464355 + }, + { + "auxiliary_loss_clip": 0.01446767, + "auxiliary_loss_mlp": 0.01227924, + "balance_loss_clip": 1.13123298, + "balance_loss_mlp": 1.02669871, + "epoch": 0.5415301367803999, + "flos": 16909718181120.0, + "grad_norm": 1.9783591333737947, + "language_loss": 0.83299142, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85973835, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 2.796051025390625 + }, + { + "auxiliary_loss_clip": 0.01446014, + "auxiliary_loss_mlp": 0.01231595, + "balance_loss_clip": 1.13016975, + "balance_loss_mlp": 1.02903485, + "epoch": 0.5415902600330678, + "flos": 25706635843680.0, + "grad_norm": 2.264838072278777, + "language_loss": 0.66945618, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69623232, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 2.812920093536377 + }, + { + "auxiliary_loss_clip": 0.01441956, + "auxiliary_loss_mlp": 0.01237452, + "balance_loss_clip": 1.12572455, + "balance_loss_mlp": 1.03393817, + "epoch": 0.5416503832857358, + "flos": 19210159893600.0, + "grad_norm": 2.8221344366436534, + "language_loss": 0.74516714, + "learning_rate": 1.827488379924234e-06, + "loss": 0.77196115, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 2.755406141281128 + }, + { + "auxiliary_loss_clip": 0.0144815, + "auxiliary_loss_mlp": 0.01238178, + "balance_loss_clip": 1.13056898, + "balance_loss_mlp": 1.03275621, + "epoch": 0.5417105065384037, + "flos": 12715352782560.0, + "grad_norm": 2.654622364115422, + "language_loss": 0.87456858, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.90143186, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 2.779512405395508 + }, + { + "auxiliary_loss_clip": 0.01451902, + "auxiliary_loss_mlp": 0.01233285, + "balance_loss_clip": 1.13488102, + "balance_loss_mlp": 1.03015292, + "epoch": 0.5417706297910717, + "flos": 30338544867840.0, + "grad_norm": 3.053315482480864, + "language_loss": 0.65514529, + "learning_rate": 1.826712372694122e-06, + "loss": 0.68199718, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 6.092804193496704 + }, + { + "auxiliary_loss_clip": 0.01452825, + "auxiliary_loss_mlp": 0.01235811, + "balance_loss_clip": 1.13657212, + "balance_loss_mlp": 1.03172481, + "epoch": 0.5418307530437396, + "flos": 29023448660640.0, + "grad_norm": 3.151997195864966, + "language_loss": 0.78864944, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81553578, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.829735040664673 + }, + { + "auxiliary_loss_clip": 0.01450241, + "auxiliary_loss_mlp": 0.01243611, + "balance_loss_clip": 1.13410866, + "balance_loss_mlp": 1.04219472, + "epoch": 0.5418908762964076, + "flos": 16875354903840.0, + "grad_norm": 2.7409285191217743, + "language_loss": 0.7462582, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.7731967, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.7966856956481934 + }, + { + "auxiliary_loss_clip": 0.01447564, + "auxiliary_loss_mlp": 0.01246598, + "balance_loss_clip": 1.13130403, + "balance_loss_mlp": 1.04766178, + "epoch": 0.5419509995490756, + "flos": 18951525220320.0, + "grad_norm": 4.574380265238311, + "language_loss": 0.7233727, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.75031424, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.7649178504943848 + }, + { + "auxiliary_loss_clip": 0.01446752, + "auxiliary_loss_mlp": 0.01231643, + "balance_loss_clip": 1.13097668, + "balance_loss_mlp": 1.0296551, + "epoch": 0.5420111228017436, + "flos": 18079699790880.0, + "grad_norm": 1.6250990511449033, + "language_loss": 0.80603683, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.83282077, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.856255292892456 + }, + { + "auxiliary_loss_clip": 0.01454468, + "auxiliary_loss_mlp": 0.01240997, + "balance_loss_clip": 1.1373508, + "balance_loss_mlp": 1.03500366, + "epoch": 0.5420712460544116, + "flos": 19063755738720.0, + "grad_norm": 2.1897849259863245, + "language_loss": 0.81485653, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.84181118, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 4.318044424057007 + }, + { + "auxiliary_loss_clip": 0.0144664, + "auxiliary_loss_mlp": 0.01233576, + "balance_loss_clip": 1.12894809, + "balance_loss_mlp": 1.03158808, + "epoch": 0.5421313693070795, + "flos": 18189085697280.0, + "grad_norm": 1.6488672971574787, + "language_loss": 0.81389105, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.84069318, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.808828353881836 + }, + { + "auxiliary_loss_clip": 0.01449176, + "auxiliary_loss_mlp": 0.01231462, + "balance_loss_clip": 1.13139486, + "balance_loss_mlp": 1.0279479, + "epoch": 0.5421914925597475, + "flos": 13007630098080.0, + "grad_norm": 1.86408202254971, + "language_loss": 0.77676642, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.80357289, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.789022207260132 + }, + { + "auxiliary_loss_clip": 0.01446096, + "auxiliary_loss_mlp": 0.01249741, + "balance_loss_clip": 1.12899649, + "balance_loss_mlp": 1.04718018, + "epoch": 0.5422516158124154, + "flos": 46762208942400.0, + "grad_norm": 1.637617909352819, + "language_loss": 0.66694593, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.69390428, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 3.034149646759033 + }, + { + "auxiliary_loss_clip": 0.01446899, + "auxiliary_loss_mlp": 0.01235855, + "balance_loss_clip": 1.12932658, + "balance_loss_mlp": 1.03548801, + "epoch": 0.5423117390650835, + "flos": 31761320214240.0, + "grad_norm": 1.705965181364156, + "language_loss": 0.69732535, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72415292, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.838313102722168 + }, + { + "auxiliary_loss_clip": 0.01446197, + "auxiliary_loss_mlp": 0.01236878, + "balance_loss_clip": 1.1292206, + "balance_loss_mlp": 1.03717875, + "epoch": 0.5423718623177514, + "flos": 27204623460000.0, + "grad_norm": 1.6018869087725198, + "language_loss": 0.80259931, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82942998, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.8766958713531494 + }, + { + "auxiliary_loss_clip": 0.01445326, + "auxiliary_loss_mlp": 0.01227438, + "balance_loss_clip": 1.12763119, + "balance_loss_mlp": 1.02411425, + "epoch": 0.5424319855704194, + "flos": 23548502044800.0, + "grad_norm": 1.858289784515586, + "language_loss": 0.78447747, + "learning_rate": 1.822444805916788e-06, + "loss": 0.81120515, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 2.8941872119903564 + }, + { + "auxiliary_loss_clip": 0.0144799, + "auxiliary_loss_mlp": 0.01224798, + "balance_loss_clip": 1.13151646, + "balance_loss_mlp": 1.02261925, + "epoch": 0.5424921088230873, + "flos": 26618096564640.0, + "grad_norm": 1.9957390464555225, + "language_loss": 0.82020688, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84693474, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 2.89898681640625 + }, + { + "auxiliary_loss_clip": 0.01454298, + "auxiliary_loss_mlp": 0.01244294, + "balance_loss_clip": 1.13707101, + "balance_loss_mlp": 1.040398, + "epoch": 0.5425522320757553, + "flos": 23589085540320.0, + "grad_norm": 1.7408042652714635, + "language_loss": 0.71689409, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.74388003, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.8590667247772217 + }, + { + "auxiliary_loss_clip": 0.01450164, + "auxiliary_loss_mlp": 0.01236011, + "balance_loss_clip": 1.13393116, + "balance_loss_mlp": 1.03411829, + "epoch": 0.5426123553284232, + "flos": 30594941779680.0, + "grad_norm": 1.7900681229499853, + "language_loss": 0.65215659, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67901832, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.8829123973846436 + }, + { + "auxiliary_loss_clip": 0.01444882, + "auxiliary_loss_mlp": 0.01238097, + "balance_loss_clip": 1.12930727, + "balance_loss_mlp": 1.037444, + "epoch": 0.5426724785810912, + "flos": 12496770610560.0, + "grad_norm": 1.7901408233133893, + "language_loss": 0.73908991, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76591969, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.738104820251465 + }, + { + "auxiliary_loss_clip": 0.01441246, + "auxiliary_loss_mlp": 0.01235439, + "balance_loss_clip": 1.12389326, + "balance_loss_mlp": 1.03230596, + "epoch": 0.5427326018337592, + "flos": 26066843222400.0, + "grad_norm": 1.8305399509574825, + "language_loss": 0.78732562, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.81409246, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.832479238510132 + }, + { + "auxiliary_loss_clip": 0.01488821, + "auxiliary_loss_mlp": 0.01195419, + "balance_loss_clip": 1.18510079, + "balance_loss_mlp": 1.00239563, + "epoch": 0.5427927250864272, + "flos": 65991522556800.0, + "grad_norm": 0.742214223777786, + "language_loss": 0.56526709, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.59210956, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.3258588314056396 + }, + { + "auxiliary_loss_clip": 0.01438139, + "auxiliary_loss_mlp": 0.01231591, + "balance_loss_clip": 1.12091839, + "balance_loss_mlp": 1.02788603, + "epoch": 0.5428528483390952, + "flos": 19977492149280.0, + "grad_norm": 2.0361290056843995, + "language_loss": 0.77631795, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80301523, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.7433815002441406 + }, + { + "auxiliary_loss_clip": 0.01440547, + "auxiliary_loss_mlp": 0.01229516, + "balance_loss_clip": 1.12362456, + "balance_loss_mlp": 1.02657402, + "epoch": 0.5429129715917631, + "flos": 21834321730560.0, + "grad_norm": 1.6642314046435305, + "language_loss": 0.83479518, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.86149579, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 2.8177876472473145 + }, + { + "auxiliary_loss_clip": 0.01441564, + "auxiliary_loss_mlp": 0.01233943, + "balance_loss_clip": 1.12448788, + "balance_loss_mlp": 1.02890325, + "epoch": 0.5429730948444311, + "flos": 27785195634240.0, + "grad_norm": 1.5977378840537868, + "language_loss": 0.75131273, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.77806783, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.832165241241455 + }, + { + "auxiliary_loss_clip": 0.01446177, + "auxiliary_loss_mlp": 0.01239791, + "balance_loss_clip": 1.13036513, + "balance_loss_mlp": 1.03971004, + "epoch": 0.543033218097099, + "flos": 26762869808640.0, + "grad_norm": 5.091120946542414, + "language_loss": 0.8504349, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87729454, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 2.8144161701202393 + }, + { + "auxiliary_loss_clip": 0.0143831, + "auxiliary_loss_mlp": 0.01254717, + "balance_loss_clip": 1.12267613, + "balance_loss_mlp": 1.05444491, + "epoch": 0.5430933413497671, + "flos": 22677511034880.0, + "grad_norm": 2.1695044651051125, + "language_loss": 0.73589385, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76282412, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 2.815704345703125 + }, + { + "auxiliary_loss_clip": 0.0144286, + "auxiliary_loss_mlp": 0.01239179, + "balance_loss_clip": 1.12676215, + "balance_loss_mlp": 1.03680873, + "epoch": 0.543153464602435, + "flos": 24609780455040.0, + "grad_norm": 1.9860039803318557, + "language_loss": 0.75887042, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.78569078, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 2.8226816654205322 + }, + { + "auxiliary_loss_clip": 0.01437575, + "auxiliary_loss_mlp": 0.01245263, + "balance_loss_clip": 1.1210413, + "balance_loss_mlp": 1.04499137, + "epoch": 0.543213587855103, + "flos": 19027723622400.0, + "grad_norm": 1.816020631004028, + "language_loss": 0.84351003, + "learning_rate": 1.817402369770655e-06, + "loss": 0.87033844, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 2.7584316730499268 + }, + { + "auxiliary_loss_clip": 0.01488575, + "auxiliary_loss_mlp": 0.01201309, + "balance_loss_clip": 1.18547225, + "balance_loss_mlp": 1.00790405, + "epoch": 0.5432737111077709, + "flos": 65692835382240.0, + "grad_norm": 0.7088154257769345, + "language_loss": 0.55802619, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.58492506, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 3.3397693634033203 + }, + { + "auxiliary_loss_clip": 0.01435017, + "auxiliary_loss_mlp": 0.01230838, + "balance_loss_clip": 1.11774611, + "balance_loss_mlp": 1.02675128, + "epoch": 0.5433338343604389, + "flos": 22093904607840.0, + "grad_norm": 1.6075362670135511, + "language_loss": 0.75346446, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.780123, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 2.829176902770996 + }, + { + "auxiliary_loss_clip": 0.01432188, + "auxiliary_loss_mlp": 0.01231829, + "balance_loss_clip": 1.11483729, + "balance_loss_mlp": 1.03127098, + "epoch": 0.5433939576131068, + "flos": 34675218180000.0, + "grad_norm": 1.8704055054202315, + "language_loss": 0.66737205, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.69401222, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 2.900456428527832 + }, + { + "auxiliary_loss_clip": 0.01432671, + "auxiliary_loss_mlp": 0.01238344, + "balance_loss_clip": 1.11625171, + "balance_loss_mlp": 1.03759539, + "epoch": 0.5434540808657748, + "flos": 20305460227680.0, + "grad_norm": 2.436250584491349, + "language_loss": 0.77919078, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80590093, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 2.8346705436706543 + }, + { + "auxiliary_loss_clip": 0.01435145, + "auxiliary_loss_mlp": 0.01245147, + "balance_loss_clip": 1.11885679, + "balance_loss_mlp": 1.04411197, + "epoch": 0.5435142041184428, + "flos": 23115054660480.0, + "grad_norm": 1.9028397741578877, + "language_loss": 0.76282871, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78963166, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 2.821504592895508 + }, + { + "auxiliary_loss_clip": 0.01488256, + "auxiliary_loss_mlp": 0.0120134, + "balance_loss_clip": 1.18631554, + "balance_loss_mlp": 1.00831604, + "epoch": 0.5435743273711108, + "flos": 64019314419840.0, + "grad_norm": 0.7487514121844522, + "language_loss": 0.52372295, + "learning_rate": 1.815075484268074e-06, + "loss": 0.55061889, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.361670732498169 + }, + { + "auxiliary_loss_clip": 0.0143487, + "auxiliary_loss_mlp": 0.01234871, + "balance_loss_clip": 1.11778188, + "balance_loss_mlp": 1.03288305, + "epoch": 0.5436344506237788, + "flos": 25121246793120.0, + "grad_norm": 1.6595950689111971, + "language_loss": 0.76327407, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78997147, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 4.170439720153809 + }, + { + "auxiliary_loss_clip": 0.014305, + "auxiliary_loss_mlp": 0.01244176, + "balance_loss_clip": 1.1139971, + "balance_loss_mlp": 1.04552531, + "epoch": 0.5436945738764467, + "flos": 19574994507840.0, + "grad_norm": 2.149599869699665, + "language_loss": 0.67453885, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.70128566, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.846864938735962 + }, + { + "auxiliary_loss_clip": 0.01434108, + "auxiliary_loss_mlp": 0.01232355, + "balance_loss_clip": 1.11659884, + "balance_loss_mlp": 1.03294146, + "epoch": 0.5437546971291147, + "flos": 21144667075200.0, + "grad_norm": 1.8267595965557817, + "language_loss": 0.84376323, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.87042785, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 2.8024258613586426 + }, + { + "auxiliary_loss_clip": 0.01436078, + "auxiliary_loss_mlp": 0.01230806, + "balance_loss_clip": 1.11647034, + "balance_loss_mlp": 1.02881753, + "epoch": 0.5438148203817826, + "flos": 25121019224160.0, + "grad_norm": 1.587962385469358, + "language_loss": 0.61963367, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64630246, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.8397769927978516 + }, + { + "auxiliary_loss_clip": 0.01438071, + "auxiliary_loss_mlp": 0.01229603, + "balance_loss_clip": 1.11926508, + "balance_loss_mlp": 1.02685153, + "epoch": 0.5438749436344507, + "flos": 23005175688000.0, + "grad_norm": 1.4443914765178687, + "language_loss": 0.69953126, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72620803, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.7881667613983154 + }, + { + "auxiliary_loss_clip": 0.01430688, + "auxiliary_loss_mlp": 0.01237587, + "balance_loss_clip": 1.11286068, + "balance_loss_mlp": 1.03750658, + "epoch": 0.5439350668871186, + "flos": 15488839242720.0, + "grad_norm": 1.7281348380209518, + "language_loss": 0.77668792, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.80337071, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.8644909858703613 + }, + { + "auxiliary_loss_clip": 0.01436283, + "auxiliary_loss_mlp": 0.01229802, + "balance_loss_clip": 1.11803317, + "balance_loss_mlp": 1.02933919, + "epoch": 0.5439951901397866, + "flos": 17240341230720.0, + "grad_norm": 1.5654663877577986, + "language_loss": 0.72949278, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.75615364, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 2.7486329078674316 + }, + { + "auxiliary_loss_clip": 0.01435415, + "auxiliary_loss_mlp": 0.01235132, + "balance_loss_clip": 1.11664248, + "balance_loss_mlp": 1.03219032, + "epoch": 0.5440553133924545, + "flos": 18663002792640.0, + "grad_norm": 2.5496570686649704, + "language_loss": 0.93700922, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.96371472, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 4.541354417800903 + }, + { + "auxiliary_loss_clip": 0.01435773, + "auxiliary_loss_mlp": 0.01233779, + "balance_loss_clip": 1.11716497, + "balance_loss_mlp": 1.03408003, + "epoch": 0.5441154366451225, + "flos": 27125239092480.0, + "grad_norm": 2.2104685922946015, + "language_loss": 0.7397505, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76644599, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.878970146179199 + }, + { + "auxiliary_loss_clip": 0.01438711, + "auxiliary_loss_mlp": 0.01233672, + "balance_loss_clip": 1.11979461, + "balance_loss_mlp": 1.03149295, + "epoch": 0.5441755598977904, + "flos": 25996220259840.0, + "grad_norm": 1.7772670860434416, + "language_loss": 0.67245555, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69917929, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 2.8801872730255127 + }, + { + "auxiliary_loss_clip": 0.01433645, + "auxiliary_loss_mlp": 0.01235513, + "balance_loss_clip": 1.11414218, + "balance_loss_mlp": 1.03543246, + "epoch": 0.5442356831504584, + "flos": 32382817237440.0, + "grad_norm": 1.807943850353356, + "language_loss": 0.6734724, + "learning_rate": 1.810810185460011e-06, + "loss": 0.70016396, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.832153081893921 + }, + { + "auxiliary_loss_clip": 0.01434402, + "auxiliary_loss_mlp": 0.01240581, + "balance_loss_clip": 1.11462307, + "balance_loss_mlp": 1.0391655, + "epoch": 0.5442958064031264, + "flos": 24166244180160.0, + "grad_norm": 1.8938095735852605, + "language_loss": 0.93362677, + "learning_rate": 1.810422473773436e-06, + "loss": 0.96037662, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.8336639404296875 + }, + { + "auxiliary_loss_clip": 0.01438238, + "auxiliary_loss_mlp": 0.01243117, + "balance_loss_clip": 1.11771107, + "balance_loss_mlp": 1.04131913, + "epoch": 0.5443559296557944, + "flos": 18766358121600.0, + "grad_norm": 2.71787855667796, + "language_loss": 0.83649993, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.8633135, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.714226245880127 + }, + { + "auxiliary_loss_clip": 0.01441059, + "auxiliary_loss_mlp": 0.01234345, + "balance_loss_clip": 1.12153268, + "balance_loss_mlp": 1.02930498, + "epoch": 0.5444160529084624, + "flos": 22634462208960.0, + "grad_norm": 2.490401112311333, + "language_loss": 0.68373501, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.71048903, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 4.189216136932373 + }, + { + "auxiliary_loss_clip": 0.01501338, + "auxiliary_loss_mlp": 0.01193138, + "balance_loss_clip": 1.19573808, + "balance_loss_mlp": 1.00049591, + "epoch": 0.5444761761611303, + "flos": 69679010924640.0, + "grad_norm": 0.7309659102961371, + "language_loss": 0.57578409, + "learning_rate": 1.80925938190531e-06, + "loss": 0.60272884, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.303011894226074 + }, + { + "auxiliary_loss_clip": 0.0143873, + "auxiliary_loss_mlp": 0.01230512, + "balance_loss_clip": 1.11807215, + "balance_loss_mlp": 1.02737916, + "epoch": 0.5445362994137983, + "flos": 14279942976480.0, + "grad_norm": 2.769086373780613, + "language_loss": 0.69524956, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.72194195, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 2.751560926437378 + }, + { + "auxiliary_loss_clip": 0.01445315, + "auxiliary_loss_mlp": 0.01234667, + "balance_loss_clip": 1.12329674, + "balance_loss_mlp": 1.03267908, + "epoch": 0.5445964226664662, + "flos": 28988744029920.0, + "grad_norm": 2.4220485057293946, + "language_loss": 0.74931026, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77611011, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.8341851234436035 + }, + { + "auxiliary_loss_clip": 0.0151234, + "auxiliary_loss_mlp": 0.0121315, + "balance_loss_clip": 1.20336318, + "balance_loss_mlp": 1.02127075, + "epoch": 0.5446565459191343, + "flos": 68627631764160.0, + "grad_norm": 0.8967920945715857, + "language_loss": 0.62588465, + "learning_rate": 1.808096355133312e-06, + "loss": 0.65313953, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.403158187866211 + }, + { + "auxiliary_loss_clip": 0.01431781, + "auxiliary_loss_mlp": 0.01234386, + "balance_loss_clip": 1.1117487, + "balance_loss_mlp": 1.03144383, + "epoch": 0.5447166691718022, + "flos": 16218318830400.0, + "grad_norm": 1.6997668233632053, + "language_loss": 0.79070759, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81736922, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.9172232151031494 + }, + { + "auxiliary_loss_clip": 0.01439467, + "auxiliary_loss_mlp": 0.01235947, + "balance_loss_clip": 1.11862695, + "balance_loss_mlp": 1.03481674, + "epoch": 0.5447767924244702, + "flos": 25851447015840.0, + "grad_norm": 1.8972240481409817, + "language_loss": 0.79447722, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.82123137, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.8524413108825684 + }, + { + "auxiliary_loss_clip": 0.01437195, + "auxiliary_loss_mlp": 0.01230772, + "balance_loss_clip": 1.1169852, + "balance_loss_mlp": 1.03002381, + "epoch": 0.5448369156771381, + "flos": 19679487681600.0, + "grad_norm": 1.7931214634144594, + "language_loss": 0.87159204, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.8982718, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 2.789863109588623 + }, + { + "auxiliary_loss_clip": 0.01438856, + "auxiliary_loss_mlp": 0.01236009, + "balance_loss_clip": 1.11797643, + "balance_loss_mlp": 1.03306699, + "epoch": 0.5448970389298061, + "flos": 19283855037120.0, + "grad_norm": 2.1522785526614268, + "language_loss": 0.82241368, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84916234, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 2.8633930683135986 + }, + { + "auxiliary_loss_clip": 0.01442114, + "auxiliary_loss_mlp": 0.01236947, + "balance_loss_clip": 1.12134361, + "balance_loss_mlp": 1.03362417, + "epoch": 0.544957162182474, + "flos": 20993559828480.0, + "grad_norm": 2.12391881829202, + "language_loss": 0.6367234, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.66351402, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.817906141281128 + }, + { + "auxiliary_loss_clip": 0.01440028, + "auxiliary_loss_mlp": 0.01237364, + "balance_loss_clip": 1.11830211, + "balance_loss_mlp": 1.0315609, + "epoch": 0.545017285435142, + "flos": 25376960998080.0, + "grad_norm": 1.6771330797852924, + "language_loss": 0.80164677, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.8284207, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 2.8410239219665527 + }, + { + "auxiliary_loss_clip": 0.0144216, + "auxiliary_loss_mlp": 0.01236925, + "balance_loss_clip": 1.12069285, + "balance_loss_mlp": 1.03283834, + "epoch": 0.54507740868781, + "flos": 19136730247200.0, + "grad_norm": 2.0122534841844413, + "language_loss": 0.78656715, + "learning_rate": 1.805382881379827e-06, + "loss": 0.81335801, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.7704360485076904 + }, + { + "auxiliary_loss_clip": 0.01436321, + "auxiliary_loss_mlp": 0.01225772, + "balance_loss_clip": 1.11353564, + "balance_loss_mlp": 1.02168584, + "epoch": 0.545137531940478, + "flos": 26252199961920.0, + "grad_norm": 1.620422212872621, + "language_loss": 0.75834405, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.78496504, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.847965955734253 + }, + { + "auxiliary_loss_clip": 0.01442855, + "auxiliary_loss_mlp": 0.01234211, + "balance_loss_clip": 1.12283289, + "balance_loss_mlp": 1.03012431, + "epoch": 0.545197655193146, + "flos": 37558393971840.0, + "grad_norm": 3.4406745144686393, + "language_loss": 0.63572103, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.66249174, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.9731643199920654 + }, + { + "auxiliary_loss_clip": 0.01438354, + "auxiliary_loss_mlp": 0.01231311, + "balance_loss_clip": 1.11734545, + "balance_loss_mlp": 1.02932286, + "epoch": 0.5452577784458139, + "flos": 26033655718080.0, + "grad_norm": 1.658513144743718, + "language_loss": 0.72648406, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.75318074, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 2.7958295345306396 + }, + { + "auxiliary_loss_clip": 0.01440493, + "auxiliary_loss_mlp": 0.01227805, + "balance_loss_clip": 1.12046957, + "balance_loss_mlp": 1.02429056, + "epoch": 0.5453179016984819, + "flos": 17640942464160.0, + "grad_norm": 1.8947631052403235, + "language_loss": 0.74298501, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76966798, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 2.793675661087036 + }, + { + "auxiliary_loss_clip": 0.01438599, + "auxiliary_loss_mlp": 0.01245001, + "balance_loss_clip": 1.11758721, + "balance_loss_mlp": 1.04396605, + "epoch": 0.5453780249511498, + "flos": 23218637558400.0, + "grad_norm": 2.96687049311437, + "language_loss": 0.60224861, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62908459, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 2.9031496047973633 + }, + { + "auxiliary_loss_clip": 0.01502113, + "auxiliary_loss_mlp": 0.012108, + "balance_loss_clip": 1.19307709, + "balance_loss_mlp": 1.01968384, + "epoch": 0.5454381482038179, + "flos": 68704095663360.0, + "grad_norm": 0.7423935829090781, + "language_loss": 0.57039362, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59752274, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.4180173873901367 + }, + { + "auxiliary_loss_clip": 0.01438442, + "auxiliary_loss_mlp": 0.01231216, + "balance_loss_clip": 1.11742711, + "balance_loss_mlp": 1.03046763, + "epoch": 0.5454982714564858, + "flos": 13262282314560.0, + "grad_norm": 1.7406826432210412, + "language_loss": 0.69569856, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72239518, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 2.7933855056762695 + }, + { + "auxiliary_loss_clip": 0.01438592, + "auxiliary_loss_mlp": 0.01239256, + "balance_loss_clip": 1.11776185, + "balance_loss_mlp": 1.03860247, + "epoch": 0.5455583947091538, + "flos": 21838076618400.0, + "grad_norm": 2.0663446995656094, + "language_loss": 0.71568078, + "learning_rate": 1.802282211606627e-06, + "loss": 0.74245924, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 2.8151679039001465 + }, + { + "auxiliary_loss_clip": 0.01440749, + "auxiliary_loss_mlp": 0.01226348, + "balance_loss_clip": 1.11989808, + "balance_loss_mlp": 1.02455068, + "epoch": 0.5456185179618217, + "flos": 17819130781440.0, + "grad_norm": 1.943862815954245, + "language_loss": 0.68200499, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70867598, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 2.755591630935669 + }, + { + "auxiliary_loss_clip": 0.01439594, + "auxiliary_loss_mlp": 0.01229492, + "balance_loss_clip": 1.11746621, + "balance_loss_mlp": 1.02712214, + "epoch": 0.5456786412144897, + "flos": 21071540854080.0, + "grad_norm": 1.7364296226624032, + "language_loss": 0.80909228, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.83578312, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 2.7391245365142822 + }, + { + "auxiliary_loss_clip": 0.01438618, + "auxiliary_loss_mlp": 0.01229209, + "balance_loss_clip": 1.11684787, + "balance_loss_mlp": 1.02617192, + "epoch": 0.5457387644671576, + "flos": 23297225434560.0, + "grad_norm": 1.6250955591812926, + "language_loss": 0.80380523, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.83048356, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 2.8753716945648193 + }, + { + "auxiliary_loss_clip": 0.01436191, + "auxiliary_loss_mlp": 0.01227763, + "balance_loss_clip": 1.11377954, + "balance_loss_mlp": 1.0250119, + "epoch": 0.5457988877198257, + "flos": 21619228949280.0, + "grad_norm": 1.8546518936878873, + "language_loss": 0.68206263, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.70870221, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 2.8016176223754883 + }, + { + "auxiliary_loss_clip": 0.01438768, + "auxiliary_loss_mlp": 0.01241791, + "balance_loss_clip": 1.11798811, + "balance_loss_mlp": 1.03713274, + "epoch": 0.5458590109724936, + "flos": 23764391317440.0, + "grad_norm": 2.2259102634342574, + "language_loss": 0.81214339, + "learning_rate": 1.800344536188764e-06, + "loss": 0.83894897, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 2.840949535369873 + }, + { + "auxiliary_loss_clip": 0.01440314, + "auxiliary_loss_mlp": 0.01235362, + "balance_loss_clip": 1.11769176, + "balance_loss_mlp": 1.03137136, + "epoch": 0.5459191342251616, + "flos": 24426509764320.0, + "grad_norm": 4.865558002395513, + "language_loss": 0.75718343, + "learning_rate": 1.799957023759277e-06, + "loss": 0.7839402, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 4.104719400405884 + }, + { + "auxiliary_loss_clip": 0.0144, + "auxiliary_loss_mlp": 0.01229111, + "balance_loss_clip": 1.11674869, + "balance_loss_mlp": 1.02292633, + "epoch": 0.5459792574778296, + "flos": 23625269369280.0, + "grad_norm": 2.3072163330900564, + "language_loss": 0.83044505, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85713613, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 2.7798805236816406 + }, + { + "auxiliary_loss_clip": 0.0143772, + "auxiliary_loss_mlp": 0.01235125, + "balance_loss_clip": 1.11610961, + "balance_loss_mlp": 1.02951241, + "epoch": 0.5460393807304975, + "flos": 19137564666720.0, + "grad_norm": 1.8258773294888935, + "language_loss": 0.70064843, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72737688, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.7558043003082275 + }, + { + "auxiliary_loss_clip": 0.01436433, + "auxiliary_loss_mlp": 0.01228261, + "balance_loss_clip": 1.11516964, + "balance_loss_mlp": 1.02722669, + "epoch": 0.5460995039831655, + "flos": 35921474048160.0, + "grad_norm": 1.6095093133350369, + "language_loss": 0.66274494, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68939191, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 2.8751683235168457 + }, + { + "auxiliary_loss_clip": 0.01439195, + "auxiliary_loss_mlp": 0.01230089, + "balance_loss_clip": 1.11731994, + "balance_loss_mlp": 1.0282917, + "epoch": 0.5461596272358334, + "flos": 26761580251200.0, + "grad_norm": 1.8970441108464797, + "language_loss": 0.79255605, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81924891, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.840388774871826 + }, + { + "auxiliary_loss_clip": 0.01437411, + "auxiliary_loss_mlp": 0.01237115, + "balance_loss_clip": 1.11556077, + "balance_loss_mlp": 1.03703392, + "epoch": 0.5462197504885015, + "flos": 20888687373120.0, + "grad_norm": 1.861172237530804, + "language_loss": 0.75284362, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77958882, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.76247239112854 + }, + { + "auxiliary_loss_clip": 0.01437524, + "auxiliary_loss_mlp": 0.01226586, + "balance_loss_clip": 1.11482799, + "balance_loss_mlp": 1.02116442, + "epoch": 0.5462798737411694, + "flos": 25806956919840.0, + "grad_norm": 2.6916282409256365, + "language_loss": 0.74638581, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.77302694, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.768385171890259 + }, + { + "auxiliary_loss_clip": 0.01436061, + "auxiliary_loss_mlp": 0.01230891, + "balance_loss_clip": 1.11405611, + "balance_loss_mlp": 1.02890253, + "epoch": 0.5463399969938374, + "flos": 25777372590720.0, + "grad_norm": 1.6418654698144246, + "language_loss": 0.76533759, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79200715, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.807274580001831 + }, + { + "auxiliary_loss_clip": 0.01445676, + "auxiliary_loss_mlp": 0.01227303, + "balance_loss_clip": 1.12292647, + "balance_loss_mlp": 1.02169085, + "epoch": 0.5464001202465053, + "flos": 18845287351200.0, + "grad_norm": 1.8970398656784915, + "language_loss": 0.77478373, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.80151355, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 5.73088812828064 + }, + { + "auxiliary_loss_clip": 0.01513498, + "auxiliary_loss_mlp": 0.01189201, + "balance_loss_clip": 1.20281625, + "balance_loss_mlp": 0.99694061, + "epoch": 0.5464602434991733, + "flos": 69056489841120.0, + "grad_norm": 0.7275829325201014, + "language_loss": 0.57721597, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.60424298, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.3697640895843506 + }, + { + "auxiliary_loss_clip": 0.01433419, + "auxiliary_loss_mlp": 0.01230564, + "balance_loss_clip": 1.11154819, + "balance_loss_mlp": 1.02876663, + "epoch": 0.5465203667518412, + "flos": 27562327580160.0, + "grad_norm": 1.8156030246250912, + "language_loss": 0.76762933, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.7942692, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 2.846762180328369 + }, + { + "auxiliary_loss_clip": 0.01435285, + "auxiliary_loss_mlp": 0.01231364, + "balance_loss_clip": 1.11312008, + "balance_loss_mlp": 1.02765918, + "epoch": 0.5465804900045093, + "flos": 21212066144160.0, + "grad_norm": 2.0766382533718297, + "language_loss": 0.74013615, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.76680267, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.9404430389404297 + }, + { + "auxiliary_loss_clip": 0.01436685, + "auxiliary_loss_mlp": 0.0123206, + "balance_loss_clip": 1.11585855, + "balance_loss_mlp": 1.03035808, + "epoch": 0.5466406132571772, + "flos": 22490978522400.0, + "grad_norm": 1.7950798097336036, + "language_loss": 0.77837467, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80506212, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 4.347275972366333 + }, + { + "auxiliary_loss_clip": 0.01437532, + "auxiliary_loss_mlp": 0.01235622, + "balance_loss_clip": 1.11498129, + "balance_loss_mlp": 1.03287089, + "epoch": 0.5467007365098452, + "flos": 17677771071840.0, + "grad_norm": 2.229992486139451, + "language_loss": 0.75269544, + "learning_rate": 1.794920057818476e-06, + "loss": 0.77942699, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.7901954650878906 + }, + { + "auxiliary_loss_clip": 0.01438975, + "auxiliary_loss_mlp": 0.01240844, + "balance_loss_clip": 1.11829805, + "balance_loss_mlp": 1.03752017, + "epoch": 0.5467608597625132, + "flos": 15700290920640.0, + "grad_norm": 1.8825410059978493, + "language_loss": 0.69398129, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.72077954, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.8007898330688477 + }, + { + "auxiliary_loss_clip": 0.01437605, + "auxiliary_loss_mlp": 0.01236815, + "balance_loss_clip": 1.11656439, + "balance_loss_mlp": 1.03349161, + "epoch": 0.5468209830151811, + "flos": 24314924024640.0, + "grad_norm": 3.1564716265587474, + "language_loss": 0.68216103, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70890522, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.802500009536743 + }, + { + "auxiliary_loss_clip": 0.0143248, + "auxiliary_loss_mlp": 0.01242805, + "balance_loss_clip": 1.11236715, + "balance_loss_mlp": 1.04310584, + "epoch": 0.5468811062678491, + "flos": 29168752898880.0, + "grad_norm": 1.8923675637503232, + "language_loss": 0.66422474, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.69097763, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.902987241744995 + }, + { + "auxiliary_loss_clip": 0.0148398, + "auxiliary_loss_mlp": 0.01200096, + "balance_loss_clip": 1.1766324, + "balance_loss_mlp": 1.00783539, + "epoch": 0.546941229520517, + "flos": 67873081662720.0, + "grad_norm": 0.7510523978292085, + "language_loss": 0.57489747, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.60173821, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.444495677947998 + }, + { + "auxiliary_loss_clip": 0.01482956, + "auxiliary_loss_mlp": 0.01204262, + "balance_loss_clip": 1.17603922, + "balance_loss_mlp": 1.01238251, + "epoch": 0.5470013527731851, + "flos": 58275758024640.0, + "grad_norm": 10.961198943100591, + "language_loss": 0.64695626, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.67382842, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.303419828414917 + }, + { + "auxiliary_loss_clip": 0.01433152, + "auxiliary_loss_mlp": 0.0124925, + "balance_loss_clip": 1.11383867, + "balance_loss_mlp": 1.04726219, + "epoch": 0.547061476025853, + "flos": 22968119511360.0, + "grad_norm": 1.591430706263, + "language_loss": 0.73163462, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75845861, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.8772222995758057 + }, + { + "auxiliary_loss_clip": 0.01429476, + "auxiliary_loss_mlp": 0.01224179, + "balance_loss_clip": 1.10940933, + "balance_loss_mlp": 1.02428937, + "epoch": 0.547121599278521, + "flos": 29970676000800.0, + "grad_norm": 2.081441286785605, + "language_loss": 0.73117721, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.75771379, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.8428070545196533 + }, + { + "auxiliary_loss_clip": 0.0143029, + "auxiliary_loss_mlp": 0.01235835, + "balance_loss_clip": 1.11090422, + "balance_loss_mlp": 1.03670728, + "epoch": 0.5471817225311889, + "flos": 36538230051360.0, + "grad_norm": 1.7498430099185802, + "language_loss": 0.67920434, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.7058655, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.8787872791290283 + }, + { + "auxiliary_loss_clip": 0.01432387, + "auxiliary_loss_mlp": 0.01239545, + "balance_loss_clip": 1.11114216, + "balance_loss_mlp": 1.03908229, + "epoch": 0.5472418457838569, + "flos": 25777600159680.0, + "grad_norm": 2.2530945727947342, + "language_loss": 0.77852023, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.8052395, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 2.8143179416656494 + }, + { + "auxiliary_loss_clip": 0.01438968, + "auxiliary_loss_mlp": 0.01246976, + "balance_loss_clip": 1.11751533, + "balance_loss_mlp": 1.04479718, + "epoch": 0.5473019690365248, + "flos": 27889954305120.0, + "grad_norm": 1.4974067902634225, + "language_loss": 0.7203033, + "learning_rate": 1.791046361258413e-06, + "loss": 0.7471627, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.9730217456817627 + }, + { + "auxiliary_loss_clip": 0.01432143, + "auxiliary_loss_mlp": 0.01235993, + "balance_loss_clip": 1.11290097, + "balance_loss_mlp": 1.03610301, + "epoch": 0.5473620922891929, + "flos": 57635865843840.0, + "grad_norm": 1.3212709154240379, + "language_loss": 0.65609348, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.68277478, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 3.1820361614227295 + }, + { + "auxiliary_loss_clip": 0.01433848, + "auxiliary_loss_mlp": 0.01229641, + "balance_loss_clip": 1.11272705, + "balance_loss_mlp": 1.02517319, + "epoch": 0.5474222155418608, + "flos": 19356070982400.0, + "grad_norm": 2.2105734954743568, + "language_loss": 0.81586099, + "learning_rate": 1.790271716558888e-06, + "loss": 0.84249586, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 2.7570464611053467 + }, + { + "auxiliary_loss_clip": 0.01434494, + "auxiliary_loss_mlp": 0.01232116, + "balance_loss_clip": 1.11435187, + "balance_loss_mlp": 1.03146243, + "epoch": 0.5474823387945288, + "flos": 25122877704000.0, + "grad_norm": 1.9482590754130265, + "language_loss": 0.80322242, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82988852, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 2.8054511547088623 + }, + { + "auxiliary_loss_clip": 0.01436934, + "auxiliary_loss_mlp": 0.01241793, + "balance_loss_clip": 1.11565208, + "balance_loss_mlp": 1.04247546, + "epoch": 0.5475424620471967, + "flos": 18006118431840.0, + "grad_norm": 1.8100065943951644, + "language_loss": 0.69794095, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.72472817, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.7235825061798096 + }, + { + "auxiliary_loss_clip": 0.01432699, + "auxiliary_loss_mlp": 0.01235073, + "balance_loss_clip": 1.11252069, + "balance_loss_mlp": 1.03375196, + "epoch": 0.5476025852998647, + "flos": 22311500647680.0, + "grad_norm": 1.8423249584979182, + "language_loss": 0.63048708, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65716481, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 2.7688698768615723 + }, + { + "auxiliary_loss_clip": 0.01425578, + "auxiliary_loss_mlp": 0.01233967, + "balance_loss_clip": 1.10585999, + "balance_loss_mlp": 1.03407669, + "epoch": 0.5476627085525327, + "flos": 20122568818560.0, + "grad_norm": 2.2901183108707754, + "language_loss": 0.75299478, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77959019, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 2.707965135574341 + }, + { + "auxiliary_loss_clip": 0.01431587, + "auxiliary_loss_mlp": 0.01227355, + "balance_loss_clip": 1.11055517, + "balance_loss_mlp": 1.02708316, + "epoch": 0.5477228318052006, + "flos": 17714523823200.0, + "grad_norm": 2.30916705900077, + "language_loss": 0.7795471, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.80613649, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.783571481704712 + }, + { + "auxiliary_loss_clip": 0.01426743, + "auxiliary_loss_mlp": 0.01244049, + "balance_loss_clip": 1.1053462, + "balance_loss_mlp": 1.04625738, + "epoch": 0.5477829550578687, + "flos": 25851295303200.0, + "grad_norm": 1.5996185437443056, + "language_loss": 0.71182269, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73853058, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.854210138320923 + }, + { + "auxiliary_loss_clip": 0.0143173, + "auxiliary_loss_mlp": 0.01234028, + "balance_loss_clip": 1.11178756, + "balance_loss_mlp": 1.03280258, + "epoch": 0.5478430783105366, + "flos": 23041928439360.0, + "grad_norm": 3.1823724074846194, + "language_loss": 0.71385241, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.74050999, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 2.787736654281616 + }, + { + "auxiliary_loss_clip": 0.01430991, + "auxiliary_loss_mlp": 0.0124709, + "balance_loss_clip": 1.10982299, + "balance_loss_mlp": 1.04700971, + "epoch": 0.5479032015632046, + "flos": 16073090448480.0, + "grad_norm": 2.5598664477496014, + "language_loss": 0.88015628, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90693706, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 2.7723822593688965 + }, + { + "auxiliary_loss_clip": 0.01434951, + "auxiliary_loss_mlp": 0.01245775, + "balance_loss_clip": 1.11435068, + "balance_loss_mlp": 1.04779208, + "epoch": 0.5479633248158725, + "flos": 24280940028960.0, + "grad_norm": 1.5243717292786405, + "language_loss": 0.73116195, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75796926, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 2.7593390941619873 + }, + { + "auxiliary_loss_clip": 0.01432176, + "auxiliary_loss_mlp": 0.01228291, + "balance_loss_clip": 1.11136651, + "balance_loss_mlp": 1.02801943, + "epoch": 0.5480234480685405, + "flos": 26360561808000.0, + "grad_norm": 1.8398358493795548, + "language_loss": 0.72300541, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74961007, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 2.794957399368286 + }, + { + "auxiliary_loss_clip": 0.01432458, + "auxiliary_loss_mlp": 0.01233983, + "balance_loss_clip": 1.11075509, + "balance_loss_mlp": 1.03294873, + "epoch": 0.5480835713212084, + "flos": 22057303569120.0, + "grad_norm": 2.017005743108544, + "language_loss": 0.72020173, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74686611, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 2.7748494148254395 + }, + { + "auxiliary_loss_clip": 0.01435114, + "auxiliary_loss_mlp": 0.01237701, + "balance_loss_clip": 1.11445665, + "balance_loss_mlp": 1.03552246, + "epoch": 0.5481436945738765, + "flos": 25303379639040.0, + "grad_norm": 1.9427782717515334, + "language_loss": 0.76496565, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.79169375, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 2.846174955368042 + }, + { + "auxiliary_loss_clip": 0.01431037, + "auxiliary_loss_mlp": 0.01227024, + "balance_loss_clip": 1.11036944, + "balance_loss_mlp": 1.02637136, + "epoch": 0.5482038178265444, + "flos": 33583521021120.0, + "grad_norm": 1.7307538286213482, + "language_loss": 0.62602341, + "learning_rate": 1.785237306671674e-06, + "loss": 0.65260404, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 4.33179235458374 + }, + { + "auxiliary_loss_clip": 0.01437407, + "auxiliary_loss_mlp": 0.01236725, + "balance_loss_clip": 1.11650562, + "balance_loss_mlp": 1.03302062, + "epoch": 0.5482639410792124, + "flos": 19028216688480.0, + "grad_norm": 1.9586963581734382, + "language_loss": 0.79189503, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81863636, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 2.8226687908172607 + }, + { + "auxiliary_loss_clip": 0.01433945, + "auxiliary_loss_mlp": 0.01233442, + "balance_loss_clip": 1.11430216, + "balance_loss_mlp": 1.03565025, + "epoch": 0.5483240643318803, + "flos": 25412651760960.0, + "grad_norm": 1.6832675454651977, + "language_loss": 0.82231021, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84898406, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.928069829940796 + }, + { + "auxiliary_loss_clip": 0.01435395, + "auxiliary_loss_mlp": 0.01229468, + "balance_loss_clip": 1.11544454, + "balance_loss_mlp": 1.02747989, + "epoch": 0.5483841875845483, + "flos": 21468842337600.0, + "grad_norm": 1.9538944940436371, + "language_loss": 0.80415481, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.83080351, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 2.784146785736084 + }, + { + "auxiliary_loss_clip": 0.01433066, + "auxiliary_loss_mlp": 0.01241313, + "balance_loss_clip": 1.11242127, + "balance_loss_mlp": 1.04113674, + "epoch": 0.5484443108372163, + "flos": 24749205828480.0, + "grad_norm": 1.8925079719725393, + "language_loss": 0.61132789, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63807166, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.82802152633667 + }, + { + "auxiliary_loss_clip": 0.01435716, + "auxiliary_loss_mlp": 0.01236971, + "balance_loss_clip": 1.11558425, + "balance_loss_mlp": 1.03669989, + "epoch": 0.5485044340898843, + "flos": 25377795417600.0, + "grad_norm": 2.2842932271685377, + "language_loss": 0.71746135, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.74418819, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 2.8379907608032227 + }, + { + "auxiliary_loss_clip": 0.01440785, + "auxiliary_loss_mlp": 0.012304, + "balance_loss_clip": 1.11992908, + "balance_loss_mlp": 1.02974701, + "epoch": 0.5485645573425523, + "flos": 12642833412000.0, + "grad_norm": 2.3217771158728238, + "language_loss": 0.83332705, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.86003894, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.778315305709839 + }, + { + "auxiliary_loss_clip": 0.01442782, + "auxiliary_loss_mlp": 0.01234786, + "balance_loss_clip": 1.12226653, + "balance_loss_mlp": 1.03508687, + "epoch": 0.5486246805952202, + "flos": 28331935525440.0, + "grad_norm": 2.6685998854748125, + "language_loss": 0.80230844, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.8290841, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 4.31989049911499 + }, + { + "auxiliary_loss_clip": 0.01437717, + "auxiliary_loss_mlp": 0.01236282, + "balance_loss_clip": 1.11673284, + "balance_loss_mlp": 1.03143275, + "epoch": 0.5486848038478882, + "flos": 16801887329280.0, + "grad_norm": 2.3124621500457416, + "language_loss": 0.75074381, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.77748382, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.8255109786987305 + }, + { + "auxiliary_loss_clip": 0.0143315, + "auxiliary_loss_mlp": 0.01231526, + "balance_loss_clip": 1.1121285, + "balance_loss_mlp": 1.02801192, + "epoch": 0.5487449271005561, + "flos": 17238027612960.0, + "grad_norm": 2.6867909049911938, + "language_loss": 0.66851676, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.69516361, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 4.1991682052612305 + }, + { + "auxiliary_loss_clip": 0.014415, + "auxiliary_loss_mlp": 0.01232396, + "balance_loss_clip": 1.12029231, + "balance_loss_mlp": 1.03098035, + "epoch": 0.5488050503532241, + "flos": 17342407002240.0, + "grad_norm": 1.755839412592081, + "language_loss": 0.83497268, + "learning_rate": 1.781365618532181e-06, + "loss": 0.86171162, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 2.8306198120117188 + }, + { + "auxiliary_loss_clip": 0.01440204, + "auxiliary_loss_mlp": 0.01241037, + "balance_loss_clip": 1.11856365, + "balance_loss_mlp": 1.03618753, + "epoch": 0.548865173605892, + "flos": 17241137722080.0, + "grad_norm": 1.9944215589094023, + "language_loss": 0.74280453, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76961696, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 2.8217122554779053 + }, + { + "auxiliary_loss_clip": 0.01444701, + "auxiliary_loss_mlp": 0.01242125, + "balance_loss_clip": 1.12447047, + "balance_loss_mlp": 1.0382297, + "epoch": 0.5489252968585601, + "flos": 17458619977440.0, + "grad_norm": 4.3539459516314025, + "language_loss": 0.63193917, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65880746, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 4.249333381652832 + }, + { + "auxiliary_loss_clip": 0.01440293, + "auxiliary_loss_mlp": 0.01241521, + "balance_loss_clip": 1.11934066, + "balance_loss_mlp": 1.03991413, + "epoch": 0.548985420111228, + "flos": 26325819249120.0, + "grad_norm": 1.7995495855401353, + "language_loss": 0.63062036, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65743852, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.861477851867676 + }, + { + "auxiliary_loss_clip": 0.01442091, + "auxiliary_loss_mlp": 0.0123667, + "balance_loss_clip": 1.12042332, + "balance_loss_mlp": 1.03487253, + "epoch": 0.549045543363896, + "flos": 18695507590080.0, + "grad_norm": 14.2745625817145, + "language_loss": 0.74832964, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.77511728, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.7845802307128906 + }, + { + "auxiliary_loss_clip": 0.01436838, + "auxiliary_loss_mlp": 0.01231428, + "balance_loss_clip": 1.11579728, + "balance_loss_mlp": 1.03106081, + "epoch": 0.5491056666165639, + "flos": 24719697355680.0, + "grad_norm": 1.616183845008402, + "language_loss": 0.8162958, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.84297848, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.8273510932922363 + }, + { + "auxiliary_loss_clip": 0.01437021, + "auxiliary_loss_mlp": 0.01232903, + "balance_loss_clip": 1.11524665, + "balance_loss_mlp": 1.03310776, + "epoch": 0.5491657898692319, + "flos": 21578910950880.0, + "grad_norm": 1.7685663956577817, + "language_loss": 0.70253599, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72923523, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.803298234939575 + }, + { + "auxiliary_loss_clip": 0.01441548, + "auxiliary_loss_mlp": 0.01232122, + "balance_loss_clip": 1.11938417, + "balance_loss_mlp": 1.03127789, + "epoch": 0.5492259131219, + "flos": 50480191915200.0, + "grad_norm": 4.450101973396505, + "language_loss": 0.61224723, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63898391, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 3.009032726287842 + }, + { + "auxiliary_loss_clip": 0.01440838, + "auxiliary_loss_mlp": 0.01237652, + "balance_loss_clip": 1.11962855, + "balance_loss_mlp": 1.03242147, + "epoch": 0.5492860363745679, + "flos": 25121626074720.0, + "grad_norm": 3.031896354416789, + "language_loss": 0.72070205, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74748695, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.8038105964660645 + }, + { + "auxiliary_loss_clip": 0.01437945, + "auxiliary_loss_mlp": 0.01236824, + "balance_loss_clip": 1.11548853, + "balance_loss_mlp": 1.03254664, + "epoch": 0.5493461596272359, + "flos": 22635751766400.0, + "grad_norm": 2.260599871634834, + "language_loss": 0.68631423, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.71306193, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.7761588096618652 + }, + { + "auxiliary_loss_clip": 0.01519767, + "auxiliary_loss_mlp": 0.01221069, + "balance_loss_clip": 1.2115469, + "balance_loss_mlp": 1.029953, + "epoch": 0.5494062828799038, + "flos": 66157725575520.0, + "grad_norm": 0.7464085386498999, + "language_loss": 0.6521309, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67953926, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.3457818031311035 + }, + { + "auxiliary_loss_clip": 0.01445665, + "auxiliary_loss_mlp": 0.01235033, + "balance_loss_clip": 1.12405908, + "balance_loss_mlp": 1.02961123, + "epoch": 0.5494664061325718, + "flos": 21108103964640.0, + "grad_norm": 1.7590391446621676, + "language_loss": 0.74980009, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77660704, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.7851510047912598 + }, + { + "auxiliary_loss_clip": 0.01443024, + "auxiliary_loss_mlp": 0.01236075, + "balance_loss_clip": 1.12058544, + "balance_loss_mlp": 1.03580332, + "epoch": 0.5495265293852397, + "flos": 14394335400000.0, + "grad_norm": 1.8925598850688976, + "language_loss": 0.71133494, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73812598, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.783018112182617 + }, + { + "auxiliary_loss_clip": 0.01437358, + "auxiliary_loss_mlp": 0.01232434, + "balance_loss_clip": 1.11597931, + "balance_loss_mlp": 1.03197145, + "epoch": 0.5495866526379077, + "flos": 25551091002240.0, + "grad_norm": 1.7292372184307152, + "language_loss": 0.76869619, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.79539418, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 2.8672640323638916 + }, + { + "auxiliary_loss_clip": 0.01441443, + "auxiliary_loss_mlp": 0.01227961, + "balance_loss_clip": 1.12165606, + "balance_loss_mlp": 1.02635384, + "epoch": 0.5496467758905756, + "flos": 21318986720160.0, + "grad_norm": 2.1188375136540927, + "language_loss": 0.74963677, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77633077, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.760831594467163 + }, + { + "auxiliary_loss_clip": 0.01444987, + "auxiliary_loss_mlp": 0.01238463, + "balance_loss_clip": 1.12396383, + "balance_loss_mlp": 1.03590262, + "epoch": 0.5497068991432437, + "flos": 22235074676640.0, + "grad_norm": 2.740832598755614, + "language_loss": 0.76631737, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.79315186, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 2.8463730812072754 + }, + { + "auxiliary_loss_clip": 0.0144312, + "auxiliary_loss_mlp": 0.01224907, + "balance_loss_clip": 1.12295175, + "balance_loss_mlp": 1.02406311, + "epoch": 0.5497670223959116, + "flos": 18481097515680.0, + "grad_norm": 2.632564560667796, + "language_loss": 0.80160975, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82828999, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 2.728213310241699 + }, + { + "auxiliary_loss_clip": 0.01441774, + "auxiliary_loss_mlp": 0.01236986, + "balance_loss_clip": 1.12161064, + "balance_loss_mlp": 1.03566492, + "epoch": 0.5498271456485796, + "flos": 29207667555360.0, + "grad_norm": 1.7706939657421845, + "language_loss": 0.71205795, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73884559, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 2.838289737701416 + }, + { + "auxiliary_loss_clip": 0.01440168, + "auxiliary_loss_mlp": 0.01231338, + "balance_loss_clip": 1.11971796, + "balance_loss_mlp": 1.02992153, + "epoch": 0.5498872689012475, + "flos": 34827273630720.0, + "grad_norm": 1.8152100435812413, + "language_loss": 0.70834601, + "learning_rate": 1.774398678985076e-06, + "loss": 0.73506105, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.8981597423553467 + }, + { + "auxiliary_loss_clip": 0.01436848, + "auxiliary_loss_mlp": 0.01229812, + "balance_loss_clip": 1.11770368, + "balance_loss_mlp": 1.02992189, + "epoch": 0.5499473921539155, + "flos": 25924345668000.0, + "grad_norm": 1.807812381219046, + "language_loss": 0.64508492, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.67175156, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 2.8083982467651367 + }, + { + "auxiliary_loss_clip": 0.01448306, + "auxiliary_loss_mlp": 0.01226325, + "balance_loss_clip": 1.1264025, + "balance_loss_mlp": 1.02128482, + "epoch": 0.5500075154065835, + "flos": 22275923669280.0, + "grad_norm": 2.677379411958436, + "language_loss": 0.80637139, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.83311772, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.867561101913452 + }, + { + "auxiliary_loss_clip": 0.01453627, + "auxiliary_loss_mlp": 0.01236288, + "balance_loss_clip": 1.13352942, + "balance_loss_mlp": 1.03429949, + "epoch": 0.5500676386592515, + "flos": 28039999563360.0, + "grad_norm": 45.4951919629105, + "language_loss": 0.79268295, + "learning_rate": 1.773237789559453e-06, + "loss": 0.8195821, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 2.7961502075195312 + }, + { + "auxiliary_loss_clip": 0.0143758, + "auxiliary_loss_mlp": 0.01228965, + "balance_loss_clip": 1.11656797, + "balance_loss_mlp": 1.02640498, + "epoch": 0.5501277619119195, + "flos": 23917091546880.0, + "grad_norm": 1.9355533637596092, + "language_loss": 0.72301567, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74968112, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 2.847707509994507 + }, + { + "auxiliary_loss_clip": 0.01437469, + "auxiliary_loss_mlp": 0.01232336, + "balance_loss_clip": 1.11527741, + "balance_loss_mlp": 1.03072929, + "epoch": 0.5501878851645874, + "flos": 20925933190560.0, + "grad_norm": 1.9661752269295847, + "language_loss": 0.74748981, + "learning_rate": 1.772463906245477e-06, + "loss": 0.7741878, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 2.767411947250366 + }, + { + "auxiliary_loss_clip": 0.01435375, + "auxiliary_loss_mlp": 0.01230117, + "balance_loss_clip": 1.115381, + "balance_loss_mlp": 1.02841461, + "epoch": 0.5502480084172554, + "flos": 20667260589120.0, + "grad_norm": 1.9136193512290876, + "language_loss": 0.76287884, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78953373, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 2.8398382663726807 + }, + { + "auxiliary_loss_clip": 0.01438184, + "auxiliary_loss_mlp": 0.01223839, + "balance_loss_clip": 1.11704183, + "balance_loss_mlp": 1.02375841, + "epoch": 0.5503081316699233, + "flos": 26434977586560.0, + "grad_norm": 8.858210533112963, + "language_loss": 0.82639283, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.85301298, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 2.8771889209747314 + }, + { + "auxiliary_loss_clip": 0.01441067, + "auxiliary_loss_mlp": 0.01223952, + "balance_loss_clip": 1.12045932, + "balance_loss_mlp": 1.01967502, + "epoch": 0.5503682549225913, + "flos": 30631846243680.0, + "grad_norm": 2.7370206224422122, + "language_loss": 0.74784815, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.77449834, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 2.884916305541992 + }, + { + "auxiliary_loss_clip": 0.01432557, + "auxiliary_loss_mlp": 0.01237425, + "balance_loss_clip": 1.11131692, + "balance_loss_mlp": 1.03448355, + "epoch": 0.5504283781752592, + "flos": 22567669990560.0, + "grad_norm": 1.7051480126491698, + "language_loss": 0.72789419, + "learning_rate": 1.770916243273199e-06, + "loss": 0.75459397, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 2.872051477432251 + }, + { + "auxiliary_loss_clip": 0.01498473, + "auxiliary_loss_mlp": 0.01183784, + "balance_loss_clip": 1.19342124, + "balance_loss_mlp": 0.98885345, + "epoch": 0.5504885014279273, + "flos": 67906913945760.0, + "grad_norm": 0.7430752847634342, + "language_loss": 0.55340493, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.58022749, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 4.819626092910767 + }, + { + "auxiliary_loss_clip": 0.01430402, + "auxiliary_loss_mlp": 0.01225889, + "balance_loss_clip": 1.11024261, + "balance_loss_mlp": 1.0242821, + "epoch": 0.5505486246805952, + "flos": 22451950081440.0, + "grad_norm": 3.6232238930754446, + "language_loss": 0.82993025, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.85649312, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 2.9454617500305176 + }, + { + "auxiliary_loss_clip": 0.01432693, + "auxiliary_loss_mlp": 0.01237786, + "balance_loss_clip": 1.11156964, + "balance_loss_mlp": 1.03617895, + "epoch": 0.5506087479332632, + "flos": 26909349819840.0, + "grad_norm": 2.156115848047961, + "language_loss": 0.7468133, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77351809, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.9025020599365234 + }, + { + "auxiliary_loss_clip": 0.01435239, + "auxiliary_loss_mlp": 0.01238926, + "balance_loss_clip": 1.11508679, + "balance_loss_mlp": 1.0393219, + "epoch": 0.5506688711859311, + "flos": 22932504604800.0, + "grad_norm": 1.5412508351670389, + "language_loss": 0.69329178, + "learning_rate": 1.769368719290979e-06, + "loss": 0.72003353, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 2.82124924659729 + }, + { + "auxiliary_loss_clip": 0.01432307, + "auxiliary_loss_mlp": 0.01226996, + "balance_loss_clip": 1.11037493, + "balance_loss_mlp": 1.02472115, + "epoch": 0.5507289944385991, + "flos": 29608989423840.0, + "grad_norm": 1.5597146103030715, + "language_loss": 0.67892337, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.7055164, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 2.8751893043518066 + }, + { + "auxiliary_loss_clip": 0.0142851, + "auxiliary_loss_mlp": 0.01238203, + "balance_loss_clip": 1.10732365, + "balance_loss_mlp": 1.04107881, + "epoch": 0.5507891176912671, + "flos": 15335873516160.0, + "grad_norm": 2.2487757774350303, + "language_loss": 0.71851885, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.74518603, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.810718059539795 + }, + { + "auxiliary_loss_clip": 0.01440028, + "auxiliary_loss_mlp": 0.0124498, + "balance_loss_clip": 1.11844063, + "balance_loss_mlp": 1.04499459, + "epoch": 0.5508492409439351, + "flos": 26580736962720.0, + "grad_norm": 1.558096845335108, + "language_loss": 0.69627428, + "learning_rate": 1.768208168081359e-06, + "loss": 0.72312438, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.8579914569854736 + }, + { + "auxiliary_loss_clip": 0.01435747, + "auxiliary_loss_mlp": 0.01234069, + "balance_loss_clip": 1.11483407, + "balance_loss_mlp": 1.03551412, + "epoch": 0.5509093641966031, + "flos": 25445573768160.0, + "grad_norm": 1.888651450977814, + "language_loss": 0.85811895, + "learning_rate": 1.767821335237733e-06, + "loss": 0.88481712, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.8638103008270264 + }, + { + "auxiliary_loss_clip": 0.01440555, + "auxiliary_loss_mlp": 0.01241355, + "balance_loss_clip": 1.11974573, + "balance_loss_mlp": 1.04098785, + "epoch": 0.550969487449271, + "flos": 18700589963520.0, + "grad_norm": 1.643632705572351, + "language_loss": 0.80371106, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.83053017, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 4.3333470821380615 + }, + { + "auxiliary_loss_clip": 0.0144014, + "auxiliary_loss_mlp": 0.01245148, + "balance_loss_clip": 1.11914349, + "balance_loss_mlp": 1.04201555, + "epoch": 0.551029610701939, + "flos": 22710812323680.0, + "grad_norm": 2.4203676508489544, + "language_loss": 0.73542845, + "learning_rate": 1.767047695977863e-06, + "loss": 0.76228136, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 4.332506895065308 + }, + { + "auxiliary_loss_clip": 0.01435288, + "auxiliary_loss_mlp": 0.01233415, + "balance_loss_clip": 1.11580122, + "balance_loss_mlp": 1.03333402, + "epoch": 0.5510897339546069, + "flos": 12422051406720.0, + "grad_norm": 3.059441874763384, + "language_loss": 0.79414153, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.82082856, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.8348655700683594 + }, + { + "auxiliary_loss_clip": 0.01441373, + "auxiliary_loss_mlp": 0.01239403, + "balance_loss_clip": 1.11975765, + "balance_loss_mlp": 1.03798676, + "epoch": 0.5511498572072749, + "flos": 18772578339840.0, + "grad_norm": 2.1793932751653404, + "language_loss": 0.7598784, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78668612, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 2.7734055519104004 + }, + { + "auxiliary_loss_clip": 0.01443424, + "auxiliary_loss_mlp": 0.01235095, + "balance_loss_clip": 1.12403321, + "balance_loss_mlp": 1.03501403, + "epoch": 0.5512099804599428, + "flos": 19575373789440.0, + "grad_norm": 2.1588279607082295, + "language_loss": 0.79812849, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82491362, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 2.768740653991699 + }, + { + "auxiliary_loss_clip": 0.01451359, + "auxiliary_loss_mlp": 0.01236982, + "balance_loss_clip": 1.13189423, + "balance_loss_mlp": 1.03747368, + "epoch": 0.5512701037126109, + "flos": 26247762367200.0, + "grad_norm": 1.9884625051769889, + "language_loss": 0.69042498, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.7173084, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 4.2952916622161865 + }, + { + "auxiliary_loss_clip": 0.01445566, + "auxiliary_loss_mlp": 0.01230753, + "balance_loss_clip": 1.12596273, + "balance_loss_mlp": 1.02952814, + "epoch": 0.5513302269652788, + "flos": 21947614237440.0, + "grad_norm": 1.898312904030736, + "language_loss": 0.85341227, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.88017547, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.7886698246002197 + }, + { + "auxiliary_loss_clip": 0.01481567, + "auxiliary_loss_mlp": 0.01230179, + "balance_loss_clip": 1.17866576, + "balance_loss_mlp": 1.03829956, + "epoch": 0.5513903502179468, + "flos": 68242240087200.0, + "grad_norm": 0.7870759225613391, + "language_loss": 0.59837306, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.62549055, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.41556978225708 + }, + { + "auxiliary_loss_clip": 0.01443529, + "auxiliary_loss_mlp": 0.0122942, + "balance_loss_clip": 1.12389827, + "balance_loss_mlp": 1.02991128, + "epoch": 0.5514504734706147, + "flos": 18736242798240.0, + "grad_norm": 1.5635350756013726, + "language_loss": 0.70143282, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72816235, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.8231661319732666 + }, + { + "auxiliary_loss_clip": 0.01437737, + "auxiliary_loss_mlp": 0.01228672, + "balance_loss_clip": 1.11825407, + "balance_loss_mlp": 1.02744675, + "epoch": 0.5515105967232827, + "flos": 22273041129120.0, + "grad_norm": 1.8136136587067742, + "language_loss": 0.76038325, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.78704733, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.814378023147583 + }, + { + "auxiliary_loss_clip": 0.01438086, + "auxiliary_loss_mlp": 0.01223214, + "balance_loss_clip": 1.1164906, + "balance_loss_mlp": 1.02351451, + "epoch": 0.5515707199759508, + "flos": 22559477508000.0, + "grad_norm": 1.767087864935947, + "language_loss": 0.75011134, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.77672434, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.784381628036499 + }, + { + "auxiliary_loss_clip": 0.01439874, + "auxiliary_loss_mlp": 0.0122763, + "balance_loss_clip": 1.11838007, + "balance_loss_mlp": 1.02583241, + "epoch": 0.5516308432286187, + "flos": 28293476006880.0, + "grad_norm": 1.8881532415583062, + "language_loss": 0.72745466, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.75412971, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.820003032684326 + }, + { + "auxiliary_loss_clip": 0.01438049, + "auxiliary_loss_mlp": 0.01221424, + "balance_loss_clip": 1.11765039, + "balance_loss_mlp": 1.02058029, + "epoch": 0.5516909664812867, + "flos": 18766130552640.0, + "grad_norm": 1.9379702760558686, + "language_loss": 0.69671524, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.72330987, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.9318320751190186 + }, + { + "auxiliary_loss_clip": 0.01439681, + "auxiliary_loss_mlp": 0.01235139, + "balance_loss_clip": 1.1188966, + "balance_loss_mlp": 1.03410411, + "epoch": 0.5517510897339546, + "flos": 27742526089920.0, + "grad_norm": 1.6076443469959272, + "language_loss": 0.71117914, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73792732, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.8675663471221924 + }, + { + "auxiliary_loss_clip": 0.01441441, + "auxiliary_loss_mlp": 0.01235356, + "balance_loss_clip": 1.12029314, + "balance_loss_mlp": 1.03661013, + "epoch": 0.5518112129866226, + "flos": 18406378311840.0, + "grad_norm": 1.4838346651577217, + "language_loss": 0.80295599, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82972395, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.782745599746704 + }, + { + "auxiliary_loss_clip": 0.01442869, + "auxiliary_loss_mlp": 0.01236252, + "balance_loss_clip": 1.12174082, + "balance_loss_mlp": 1.0354079, + "epoch": 0.5518713362392905, + "flos": 25085025036000.0, + "grad_norm": 1.6129118928967385, + "language_loss": 0.74826318, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77505434, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.826770305633545 + }, + { + "auxiliary_loss_clip": 0.01442586, + "auxiliary_loss_mlp": 0.01233543, + "balance_loss_clip": 1.12138999, + "balance_loss_mlp": 1.0315547, + "epoch": 0.5519314594919585, + "flos": 36541984939200.0, + "grad_norm": 1.6722169408895693, + "language_loss": 0.69224441, + "learning_rate": 1.761246535912924e-06, + "loss": 0.71900564, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 2.8986868858337402 + }, + { + "auxiliary_loss_clip": 0.01439089, + "auxiliary_loss_mlp": 0.01237171, + "balance_loss_clip": 1.11871767, + "balance_loss_mlp": 1.0370903, + "epoch": 0.5519915827446265, + "flos": 20450688609600.0, + "grad_norm": 2.444923790391458, + "language_loss": 0.6729973, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69975996, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.831183910369873 + }, + { + "auxiliary_loss_clip": 0.01439775, + "auxiliary_loss_mlp": 0.01222468, + "balance_loss_clip": 1.1198256, + "balance_loss_mlp": 1.01990771, + "epoch": 0.5520517059972945, + "flos": 23770270182240.0, + "grad_norm": 7.758290476242344, + "language_loss": 0.7917673, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81838971, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 2.8074419498443604 + }, + { + "auxiliary_loss_clip": 0.01437582, + "auxiliary_loss_mlp": 0.01229456, + "balance_loss_clip": 1.11698544, + "balance_loss_mlp": 1.02765882, + "epoch": 0.5521118292499624, + "flos": 22198435709760.0, + "grad_norm": 2.0033451543676235, + "language_loss": 0.8306933, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.8573637, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.9785046577453613 + }, + { + "auxiliary_loss_clip": 0.01439497, + "auxiliary_loss_mlp": 0.01227029, + "balance_loss_clip": 1.11840987, + "balance_loss_mlp": 1.02790189, + "epoch": 0.5521719525026304, + "flos": 23585368580640.0, + "grad_norm": 1.500384575253276, + "language_loss": 0.6751039, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.70176911, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 2.85465669631958 + }, + { + "auxiliary_loss_clip": 0.0143617, + "auxiliary_loss_mlp": 0.0122761, + "balance_loss_clip": 1.11517572, + "balance_loss_mlp": 1.02676582, + "epoch": 0.5522320757552983, + "flos": 26139855659040.0, + "grad_norm": 1.6026921017247324, + "language_loss": 0.76335692, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78999472, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.834291458129883 + }, + { + "auxiliary_loss_clip": 0.01438622, + "auxiliary_loss_mlp": 0.0123814, + "balance_loss_clip": 1.11839402, + "balance_loss_mlp": 1.03767776, + "epoch": 0.5522921990079663, + "flos": 24678203584320.0, + "grad_norm": 1.6563518117618037, + "language_loss": 0.74069643, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76746404, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 2.8283910751342773 + }, + { + "auxiliary_loss_clip": 0.01442374, + "auxiliary_loss_mlp": 0.01242249, + "balance_loss_clip": 1.12050176, + "balance_loss_mlp": 1.04274046, + "epoch": 0.5523523222606344, + "flos": 22750409687040.0, + "grad_norm": 2.3823450354895948, + "language_loss": 0.66882646, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.69567269, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 2.7811474800109863 + }, + { + "auxiliary_loss_clip": 0.01439922, + "auxiliary_loss_mlp": 0.01235363, + "balance_loss_clip": 1.11881936, + "balance_loss_mlp": 1.03375673, + "epoch": 0.5524124455133023, + "flos": 19757885916960.0, + "grad_norm": 1.8259188895865879, + "language_loss": 0.77815121, + "learning_rate": 1.758153413657318e-06, + "loss": 0.80490404, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 2.8132405281066895 + }, + { + "auxiliary_loss_clip": 0.01442928, + "auxiliary_loss_mlp": 0.01235297, + "balance_loss_clip": 1.12061262, + "balance_loss_mlp": 1.03445339, + "epoch": 0.5524725687659703, + "flos": 23297073721920.0, + "grad_norm": 1.943674320269701, + "language_loss": 0.81467015, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.84145242, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 2.7562077045440674 + }, + { + "auxiliary_loss_clip": 0.01444706, + "auxiliary_loss_mlp": 0.01235782, + "balance_loss_clip": 1.12362957, + "balance_loss_mlp": 1.03551078, + "epoch": 0.5525326920186382, + "flos": 24864432671520.0, + "grad_norm": 1.4241725459932004, + "language_loss": 0.76343977, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.7902447, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.81270694732666 + }, + { + "auxiliary_loss_clip": 0.01437717, + "auxiliary_loss_mlp": 0.0123859, + "balance_loss_clip": 1.11693132, + "balance_loss_mlp": 1.0354569, + "epoch": 0.5525928152713062, + "flos": 13737299326560.0, + "grad_norm": 2.567338074839676, + "language_loss": 0.78900862, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.8157717, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 2.888958215713501 + }, + { + "auxiliary_loss_clip": 0.01437703, + "auxiliary_loss_mlp": 0.01230366, + "balance_loss_clip": 1.1160326, + "balance_loss_mlp": 1.03142989, + "epoch": 0.5526529385239741, + "flos": 13072867261920.0, + "grad_norm": 1.8890589163043707, + "language_loss": 0.68850911, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71518981, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 2.7283473014831543 + }, + { + "auxiliary_loss_clip": 0.01436914, + "auxiliary_loss_mlp": 0.01231371, + "balance_loss_clip": 1.11602736, + "balance_loss_mlp": 1.0331974, + "epoch": 0.5527130617766421, + "flos": 23150593710720.0, + "grad_norm": 1.55414352799081, + "language_loss": 0.77502322, + "learning_rate": 1.756220509823588e-06, + "loss": 0.80170608, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 2.818237543106079 + }, + { + "auxiliary_loss_clip": 0.01440152, + "auxiliary_loss_mlp": 0.01228782, + "balance_loss_clip": 1.11965179, + "balance_loss_mlp": 1.02889144, + "epoch": 0.55277318502931, + "flos": 21287543911200.0, + "grad_norm": 1.5317560180223415, + "language_loss": 0.78496647, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.81165582, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 4.184910774230957 + }, + { + "auxiliary_loss_clip": 0.01439951, + "auxiliary_loss_mlp": 0.01238864, + "balance_loss_clip": 1.1178354, + "balance_loss_mlp": 1.03840184, + "epoch": 0.5528333082819781, + "flos": 38327508851040.0, + "grad_norm": 2.3523733415852393, + "language_loss": 0.69399893, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.72078705, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 2.9312617778778076 + }, + { + "auxiliary_loss_clip": 0.01441454, + "auxiliary_loss_mlp": 0.01232941, + "balance_loss_clip": 1.11952043, + "balance_loss_mlp": 1.03057086, + "epoch": 0.552893431534646, + "flos": 13555356121440.0, + "grad_norm": 2.0430579438372556, + "language_loss": 0.74157834, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76832223, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 2.777533769607544 + }, + { + "auxiliary_loss_clip": 0.01440809, + "auxiliary_loss_mlp": 0.01235087, + "balance_loss_clip": 1.12059593, + "balance_loss_mlp": 1.03624582, + "epoch": 0.552953554787314, + "flos": 21941280234720.0, + "grad_norm": 1.6340101517041223, + "language_loss": 0.76738679, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.7941457, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 2.786227226257324 + }, + { + "auxiliary_loss_clip": 0.01440005, + "auxiliary_loss_mlp": 0.01234404, + "balance_loss_clip": 1.11907816, + "balance_loss_mlp": 1.03661227, + "epoch": 0.5530136780399819, + "flos": 43661626751520.0, + "grad_norm": 1.5558920470138038, + "language_loss": 0.76481926, + "learning_rate": 1.754287837093407e-06, + "loss": 0.79156333, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.9907548427581787 + }, + { + "auxiliary_loss_clip": 0.01440826, + "auxiliary_loss_mlp": 0.01228727, + "balance_loss_clip": 1.11987424, + "balance_loss_mlp": 1.0316025, + "epoch": 0.5530738012926499, + "flos": 25047817146720.0, + "grad_norm": 1.5018662949328194, + "language_loss": 0.79367042, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.82036597, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 2.8405888080596924 + }, + { + "auxiliary_loss_clip": 0.01440056, + "auxiliary_loss_mlp": 0.01228261, + "balance_loss_clip": 1.11717093, + "balance_loss_mlp": 1.03008723, + "epoch": 0.553133924545318, + "flos": 16473957179040.0, + "grad_norm": 4.611338457006293, + "language_loss": 0.64185476, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66853791, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.7535324096679688 + }, + { + "auxiliary_loss_clip": 0.01449269, + "auxiliary_loss_mlp": 0.01220642, + "balance_loss_clip": 1.12528038, + "balance_loss_mlp": 1.01655579, + "epoch": 0.5531940477979859, + "flos": 24608415041280.0, + "grad_norm": 1.6490559134868326, + "language_loss": 0.65982229, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68652129, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 4.3831048011779785 + }, + { + "auxiliary_loss_clip": 0.01446502, + "auxiliary_loss_mlp": 0.0122907, + "balance_loss_clip": 1.12563968, + "balance_loss_mlp": 1.02917957, + "epoch": 0.5532541710506539, + "flos": 22161531245760.0, + "grad_norm": 6.268109671363979, + "language_loss": 0.6098401, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63659585, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.775308609008789 + }, + { + "auxiliary_loss_clip": 0.01447577, + "auxiliary_loss_mlp": 0.01226806, + "balance_loss_clip": 1.12475383, + "balance_loss_mlp": 1.02462649, + "epoch": 0.5533142943033218, + "flos": 21399395148000.0, + "grad_norm": 4.72515842473989, + "language_loss": 0.64499462, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.67173839, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 4.284540891647339 + }, + { + "auxiliary_loss_clip": 0.01446648, + "auxiliary_loss_mlp": 0.01227985, + "balance_loss_clip": 1.12363029, + "balance_loss_mlp": 1.02790368, + "epoch": 0.5533744175559898, + "flos": 23552825855040.0, + "grad_norm": 1.6567486646736607, + "language_loss": 0.63814783, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.66489422, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.830151081085205 + }, + { + "auxiliary_loss_clip": 0.01438419, + "auxiliary_loss_mlp": 0.01231733, + "balance_loss_clip": 1.11730301, + "balance_loss_mlp": 1.03298688, + "epoch": 0.5534345408086577, + "flos": 24063837055200.0, + "grad_norm": 1.5306991698444903, + "language_loss": 0.7729249, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79962647, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.850511074066162 + }, + { + "auxiliary_loss_clip": 0.01444487, + "auxiliary_loss_mlp": 0.01223682, + "balance_loss_clip": 1.12283087, + "balance_loss_mlp": 1.02322006, + "epoch": 0.5534946640613257, + "flos": 33774984194400.0, + "grad_norm": 1.5428675730353503, + "language_loss": 0.72737205, + "learning_rate": 1.751196045993537e-06, + "loss": 0.75405377, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 2.9695873260498047 + }, + { + "auxiliary_loss_clip": 0.01442893, + "auxiliary_loss_mlp": 0.01230824, + "balance_loss_clip": 1.12225628, + "balance_loss_mlp": 1.03036141, + "epoch": 0.5535547873139937, + "flos": 15161060805120.0, + "grad_norm": 2.5305578903179944, + "language_loss": 0.75950551, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.7862426, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 4.293233871459961 + }, + { + "auxiliary_loss_clip": 0.01441439, + "auxiliary_loss_mlp": 0.01229089, + "balance_loss_clip": 1.11822236, + "balance_loss_mlp": 1.02519345, + "epoch": 0.5536149105666617, + "flos": 16982654761440.0, + "grad_norm": 8.778607034810193, + "language_loss": 0.6157428, + "learning_rate": 1.750423192272189e-06, + "loss": 0.64244807, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 2.779568672180176 + }, + { + "auxiliary_loss_clip": 0.01441917, + "auxiliary_loss_mlp": 0.01228826, + "balance_loss_clip": 1.12107635, + "balance_loss_mlp": 1.0268383, + "epoch": 0.5536750338193296, + "flos": 18151877808000.0, + "grad_norm": 2.2563232986391024, + "language_loss": 0.64228117, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66898859, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.8015859127044678 + }, + { + "auxiliary_loss_clip": 0.01437972, + "auxiliary_loss_mlp": 0.01237021, + "balance_loss_clip": 1.1167016, + "balance_loss_mlp": 1.03608155, + "epoch": 0.5537351570719976, + "flos": 22750030405440.0, + "grad_norm": 2.2114365412019903, + "language_loss": 0.82965374, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.85640371, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.7574305534362793 + }, + { + "auxiliary_loss_clip": 0.01434291, + "auxiliary_loss_mlp": 0.01233623, + "balance_loss_clip": 1.113343, + "balance_loss_mlp": 1.03735685, + "epoch": 0.5537952803246655, + "flos": 26358248190240.0, + "grad_norm": 2.007677514227425, + "language_loss": 0.72798145, + "learning_rate": 1.74926398270663e-06, + "loss": 0.75466055, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.8570733070373535 + }, + { + "auxiliary_loss_clip": 0.0144531, + "auxiliary_loss_mlp": 0.01238213, + "balance_loss_clip": 1.12352252, + "balance_loss_mlp": 1.03183818, + "epoch": 0.5538554035773335, + "flos": 18039192151680.0, + "grad_norm": 2.411621168225186, + "language_loss": 0.66445529, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.6912905, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 2.822561025619507 + }, + { + "auxiliary_loss_clip": 0.01441365, + "auxiliary_loss_mlp": 0.01233557, + "balance_loss_clip": 1.1194731, + "balance_loss_mlp": 1.03118753, + "epoch": 0.5539155268300014, + "flos": 31688952556320.0, + "grad_norm": 1.4041532731827222, + "language_loss": 0.51658463, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.54333389, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.8248133659362793 + }, + { + "auxiliary_loss_clip": 0.01441701, + "auxiliary_loss_mlp": 0.01225675, + "balance_loss_clip": 1.12008083, + "balance_loss_mlp": 1.02368665, + "epoch": 0.5539756500826695, + "flos": 15195499938720.0, + "grad_norm": 2.1228036671894834, + "language_loss": 0.85706657, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.88374037, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.8215978145599365 + }, + { + "auxiliary_loss_clip": 0.01442836, + "auxiliary_loss_mlp": 0.01234672, + "balance_loss_clip": 1.12350011, + "balance_loss_mlp": 1.03497291, + "epoch": 0.5540357733353375, + "flos": 26355176009280.0, + "grad_norm": 2.3298852343608387, + "language_loss": 0.701612, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72838706, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.859717845916748 + }, + { + "auxiliary_loss_clip": 0.01441184, + "auxiliary_loss_mlp": 0.01236334, + "balance_loss_clip": 1.12015533, + "balance_loss_mlp": 1.03539467, + "epoch": 0.5540958965880054, + "flos": 21325775860800.0, + "grad_norm": 1.5693578809148367, + "language_loss": 0.72899562, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.7557708, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.859238624572754 + }, + { + "auxiliary_loss_clip": 0.01443349, + "auxiliary_loss_mlp": 0.01240371, + "balance_loss_clip": 1.12263107, + "balance_loss_mlp": 1.04086232, + "epoch": 0.5541560198406734, + "flos": 25669238313600.0, + "grad_norm": 1.9346569560503126, + "language_loss": 0.71749884, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.74433601, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.835040330886841 + }, + { + "auxiliary_loss_clip": 0.0143458, + "auxiliary_loss_mlp": 0.01229484, + "balance_loss_clip": 1.11480141, + "balance_loss_mlp": 1.03274083, + "epoch": 0.5542161430933413, + "flos": 21941507803680.0, + "grad_norm": 2.144995503088053, + "language_loss": 0.78631622, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.81295687, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.7424066066741943 + }, + { + "auxiliary_loss_clip": 0.01441345, + "auxiliary_loss_mlp": 0.0123951, + "balance_loss_clip": 1.11926436, + "balance_loss_mlp": 1.04028773, + "epoch": 0.5542762663460093, + "flos": 19573401525120.0, + "grad_norm": 1.9531340354548699, + "language_loss": 0.72487187, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.75168037, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 2.7813730239868164 + }, + { + "auxiliary_loss_clip": 0.01444814, + "auxiliary_loss_mlp": 0.01240963, + "balance_loss_clip": 1.12513912, + "balance_loss_mlp": 1.04116821, + "epoch": 0.5543363895986773, + "flos": 19501033867200.0, + "grad_norm": 1.855777555556298, + "language_loss": 0.71506572, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.74192351, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 2.7714831829071045 + }, + { + "auxiliary_loss_clip": 0.01441278, + "auxiliary_loss_mlp": 0.01226439, + "balance_loss_clip": 1.12115109, + "balance_loss_mlp": 1.02855158, + "epoch": 0.5543965128513453, + "flos": 22637686102560.0, + "grad_norm": 2.7834792476260586, + "language_loss": 0.79432416, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.82100135, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.787065267562866 + }, + { + "auxiliary_loss_clip": 0.01449358, + "auxiliary_loss_mlp": 0.01228452, + "balance_loss_clip": 1.12813187, + "balance_loss_mlp": 1.02837145, + "epoch": 0.5544566361040132, + "flos": 25992086090400.0, + "grad_norm": 1.6959688043891048, + "language_loss": 0.83844519, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86522329, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.821974515914917 + }, + { + "auxiliary_loss_clip": 0.01453497, + "auxiliary_loss_mlp": 0.01231974, + "balance_loss_clip": 1.13148713, + "balance_loss_mlp": 1.02731562, + "epoch": 0.5545167593566812, + "flos": 28260136789920.0, + "grad_norm": 1.7091466956896422, + "language_loss": 0.7595672, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.7864219, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.9060568809509277 + }, + { + "auxiliary_loss_clip": 0.01449602, + "auxiliary_loss_mlp": 0.01230234, + "balance_loss_clip": 1.12892485, + "balance_loss_mlp": 1.02986717, + "epoch": 0.5545768826093491, + "flos": 28479136171680.0, + "grad_norm": 1.9889969095298645, + "language_loss": 0.82002211, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84682053, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 2.8016324043273926 + }, + { + "auxiliary_loss_clip": 0.01456766, + "auxiliary_loss_mlp": 0.01236152, + "balance_loss_clip": 1.13604259, + "balance_loss_mlp": 1.03588068, + "epoch": 0.5546370058620171, + "flos": 18479997599040.0, + "grad_norm": 2.458143060970649, + "language_loss": 0.57266557, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59959477, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.8406684398651123 + }, + { + "auxiliary_loss_clip": 0.01450476, + "auxiliary_loss_mlp": 0.01240271, + "balance_loss_clip": 1.130759, + "balance_loss_mlp": 1.0388546, + "epoch": 0.554697129114685, + "flos": 22932770101920.0, + "grad_norm": 1.5031019812781754, + "language_loss": 0.67526674, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.70217425, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 2.783423662185669 + }, + { + "auxiliary_loss_clip": 0.01447341, + "auxiliary_loss_mlp": 0.01232756, + "balance_loss_clip": 1.12794018, + "balance_loss_mlp": 1.03458214, + "epoch": 0.5547572523673531, + "flos": 21799237818240.0, + "grad_norm": 1.490591062190256, + "language_loss": 0.74390215, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.77070302, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 2.8254356384277344 + }, + { + "auxiliary_loss_clip": 0.01460615, + "auxiliary_loss_mlp": 0.01245965, + "balance_loss_clip": 1.14273071, + "balance_loss_mlp": 1.04569376, + "epoch": 0.5548173756200211, + "flos": 22344839864640.0, + "grad_norm": 1.7543712479017446, + "language_loss": 0.73620224, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.76326799, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.8115124702453613 + }, + { + "auxiliary_loss_clip": 0.01449817, + "auxiliary_loss_mlp": 0.0123884, + "balance_loss_clip": 1.1315546, + "balance_loss_mlp": 1.04114342, + "epoch": 0.554877498872689, + "flos": 17860700409120.0, + "grad_norm": 1.6481343820233818, + "language_loss": 0.76060659, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.78749311, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 2.742325782775879 + }, + { + "auxiliary_loss_clip": 0.01456223, + "auxiliary_loss_mlp": 0.01236845, + "balance_loss_clip": 1.13916278, + "balance_loss_mlp": 1.03571546, + "epoch": 0.554937622125357, + "flos": 17240265374400.0, + "grad_norm": 1.4724814661501284, + "language_loss": 0.68593788, + "learning_rate": 1.741924325613172e-06, + "loss": 0.71286857, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.788987398147583 + }, + { + "auxiliary_loss_clip": 0.01446291, + "auxiliary_loss_mlp": 0.01230769, + "balance_loss_clip": 1.12790453, + "balance_loss_mlp": 1.02944827, + "epoch": 0.5549977453780249, + "flos": 25370096001120.0, + "grad_norm": 2.8654492238395086, + "language_loss": 0.6816777, + "learning_rate": 1.741538124855163e-06, + "loss": 0.70844829, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 2.776228666305542 + }, + { + "auxiliary_loss_clip": 0.01457128, + "auxiliary_loss_mlp": 0.01246684, + "balance_loss_clip": 1.13634419, + "balance_loss_mlp": 1.04564977, + "epoch": 0.555057868630693, + "flos": 25081270148160.0, + "grad_norm": 1.7543166228601368, + "language_loss": 0.78535342, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.81239152, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 2.8257994651794434 + }, + { + "auxiliary_loss_clip": 0.0145453, + "auxiliary_loss_mlp": 0.01231983, + "balance_loss_clip": 1.13561797, + "balance_loss_mlp": 1.03495395, + "epoch": 0.5551179918833609, + "flos": 26106630226560.0, + "grad_norm": 2.64262375065021, + "language_loss": 0.82549101, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.85235614, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 4.249969244003296 + }, + { + "auxiliary_loss_clip": 0.01449633, + "auxiliary_loss_mlp": 0.01237524, + "balance_loss_clip": 1.12945962, + "balance_loss_mlp": 1.03648996, + "epoch": 0.5551781151360289, + "flos": 19386451802880.0, + "grad_norm": 2.2944360213357924, + "language_loss": 0.75224751, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77911907, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 2.8317995071411133 + }, + { + "auxiliary_loss_clip": 0.01445263, + "auxiliary_loss_mlp": 0.01223527, + "balance_loss_clip": 1.12689304, + "balance_loss_mlp": 1.02459073, + "epoch": 0.5552382383886968, + "flos": 21728121789600.0, + "grad_norm": 2.065774852029203, + "language_loss": 0.6452288, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.67191672, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 2.896726369857788 + }, + { + "auxiliary_loss_clip": 0.0145234, + "auxiliary_loss_mlp": 0.01237208, + "balance_loss_clip": 1.13357043, + "balance_loss_mlp": 1.03655517, + "epoch": 0.5552983616413648, + "flos": 14357961930240.0, + "grad_norm": 2.1522553252768564, + "language_loss": 0.68463612, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.71153158, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 2.8581528663635254 + }, + { + "auxiliary_loss_clip": 0.01446024, + "auxiliary_loss_mlp": 0.01225775, + "balance_loss_clip": 1.12738681, + "balance_loss_mlp": 1.02655196, + "epoch": 0.5553584848940327, + "flos": 25480619752320.0, + "grad_norm": 1.7747359104265148, + "language_loss": 0.86223203, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88894999, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 3.022158145904541 + }, + { + "auxiliary_loss_clip": 0.01443327, + "auxiliary_loss_mlp": 0.01227129, + "balance_loss_clip": 1.12489867, + "balance_loss_mlp": 1.02685714, + "epoch": 0.5554186081467007, + "flos": 22166044696800.0, + "grad_norm": 1.656546791019563, + "language_loss": 0.73622704, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.76293159, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 2.878103733062744 + }, + { + "auxiliary_loss_clip": 0.01440522, + "auxiliary_loss_mlp": 0.01232967, + "balance_loss_clip": 1.12249219, + "balance_loss_mlp": 1.03412604, + "epoch": 0.5554787313993687, + "flos": 49750370974080.0, + "grad_norm": 1.7493268492468057, + "language_loss": 0.78389597, + "learning_rate": 1.73844887285358e-06, + "loss": 0.81063086, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 4.622663259506226 + }, + { + "auxiliary_loss_clip": 0.01441711, + "auxiliary_loss_mlp": 0.0122562, + "balance_loss_clip": 1.12311316, + "balance_loss_mlp": 1.02687454, + "epoch": 0.5555388546520367, + "flos": 22129557442560.0, + "grad_norm": 1.5102248266239262, + "language_loss": 0.79963791, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82631123, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.8175864219665527 + }, + { + "auxiliary_loss_clip": 0.01442164, + "auxiliary_loss_mlp": 0.01221773, + "balance_loss_clip": 1.12296271, + "balance_loss_mlp": 1.0230279, + "epoch": 0.5555989779047047, + "flos": 24684499658880.0, + "grad_norm": 2.976679325956003, + "language_loss": 0.65697718, + "learning_rate": 1.737676658740786e-06, + "loss": 0.68361652, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.8371999263763428 + }, + { + "auxiliary_loss_clip": 0.01448394, + "auxiliary_loss_mlp": 0.01237373, + "balance_loss_clip": 1.12925601, + "balance_loss_mlp": 1.03424013, + "epoch": 0.5556591011573726, + "flos": 16108326073440.0, + "grad_norm": 2.0799257660810233, + "language_loss": 0.73192626, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.75878388, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 4.253066539764404 + }, + { + "auxiliary_loss_clip": 0.01442946, + "auxiliary_loss_mlp": 0.01233508, + "balance_loss_clip": 1.12296581, + "balance_loss_mlp": 1.03323603, + "epoch": 0.5557192244100406, + "flos": 12935414152800.0, + "grad_norm": 2.7417611105583695, + "language_loss": 0.63669711, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.66346163, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 2.754232883453369 + }, + { + "auxiliary_loss_clip": 0.01445424, + "auxiliary_loss_mlp": 0.01230377, + "balance_loss_clip": 1.12688339, + "balance_loss_mlp": 1.02915192, + "epoch": 0.5557793476627085, + "flos": 23113916815680.0, + "grad_norm": 2.2447471181585126, + "language_loss": 0.75283313, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77959114, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 2.7566750049591064 + }, + { + "auxiliary_loss_clip": 0.01444784, + "auxiliary_loss_mlp": 0.01235909, + "balance_loss_clip": 1.12736166, + "balance_loss_mlp": 1.036973, + "epoch": 0.5558394709153766, + "flos": 21429851824800.0, + "grad_norm": 2.2874304011773052, + "language_loss": 0.75297058, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77977753, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 4.2708046436309814 + }, + { + "auxiliary_loss_clip": 0.01436656, + "auxiliary_loss_mlp": 0.01223947, + "balance_loss_clip": 1.11706376, + "balance_loss_mlp": 1.02195859, + "epoch": 0.5558995941680445, + "flos": 25080435728640.0, + "grad_norm": 2.0354625914144733, + "language_loss": 0.79791653, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82452255, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 2.835928201675415 + }, + { + "auxiliary_loss_clip": 0.01442049, + "auxiliary_loss_mlp": 0.01230285, + "balance_loss_clip": 1.12294018, + "balance_loss_mlp": 1.03020406, + "epoch": 0.5559597174207125, + "flos": 20013220840320.0, + "grad_norm": 2.060325243200868, + "language_loss": 0.73581481, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76253819, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 2.8562369346618652 + }, + { + "auxiliary_loss_clip": 0.01440969, + "auxiliary_loss_mlp": 0.01231124, + "balance_loss_clip": 1.12272477, + "balance_loss_mlp": 1.02799153, + "epoch": 0.5560198406733804, + "flos": 16837502235840.0, + "grad_norm": 2.830930114431446, + "language_loss": 0.75837815, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78509915, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 2.7664101123809814 + }, + { + "auxiliary_loss_clip": 0.01490845, + "auxiliary_loss_mlp": 0.0120021, + "balance_loss_clip": 1.18777037, + "balance_loss_mlp": 1.00604248, + "epoch": 0.5560799639260484, + "flos": 70704408931200.0, + "grad_norm": 0.8485524531695439, + "language_loss": 0.59305704, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61996758, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.4406087398529053 + }, + { + "auxiliary_loss_clip": 0.01433969, + "auxiliary_loss_mlp": 0.01227229, + "balance_loss_clip": 1.11512971, + "balance_loss_mlp": 1.02896047, + "epoch": 0.5561400871787163, + "flos": 23150821279680.0, + "grad_norm": 2.0782563432169194, + "language_loss": 0.79805207, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82466406, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.8120553493499756 + }, + { + "auxiliary_loss_clip": 0.01438111, + "auxiliary_loss_mlp": 0.01245687, + "balance_loss_clip": 1.11952949, + "balance_loss_mlp": 1.04379368, + "epoch": 0.5562002104313843, + "flos": 17568347237280.0, + "grad_norm": 2.2044083424034078, + "language_loss": 0.68638253, + "learning_rate": 1.733816187358836e-06, + "loss": 0.71322054, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.7739288806915283 + }, + { + "auxiliary_loss_clip": 0.01439001, + "auxiliary_loss_mlp": 0.01235084, + "balance_loss_clip": 1.12246084, + "balance_loss_mlp": 1.03691077, + "epoch": 0.5562603336840523, + "flos": 25047741290400.0, + "grad_norm": 1.6851055651499711, + "language_loss": 0.75366092, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.78040183, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.883800506591797 + }, + { + "auxiliary_loss_clip": 0.01440549, + "auxiliary_loss_mlp": 0.01230594, + "balance_loss_clip": 1.12401938, + "balance_loss_mlp": 1.03127635, + "epoch": 0.5563204569367203, + "flos": 29061566825760.0, + "grad_norm": 1.5447921929790613, + "language_loss": 0.72457874, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.7512902, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 2.840388059616089 + }, + { + "auxiliary_loss_clip": 0.0144689, + "auxiliary_loss_mlp": 0.01230801, + "balance_loss_clip": 1.12798333, + "balance_loss_mlp": 1.03186464, + "epoch": 0.5563805801893883, + "flos": 22092615050400.0, + "grad_norm": 1.8642003081877305, + "language_loss": 0.83317935, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85995626, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.8030529022216797 + }, + { + "auxiliary_loss_clip": 0.01480793, + "auxiliary_loss_mlp": 0.0119931, + "balance_loss_clip": 1.17945635, + "balance_loss_mlp": 1.00628662, + "epoch": 0.5564407034420562, + "flos": 58641654627360.0, + "grad_norm": 0.8633225091588655, + "language_loss": 0.64885038, + "learning_rate": 1.732272280610387e-06, + "loss": 0.67565143, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 3.157707929611206 + }, + { + "auxiliary_loss_clip": 0.01445533, + "auxiliary_loss_mlp": 0.01223895, + "balance_loss_clip": 1.12931561, + "balance_loss_mlp": 1.02486348, + "epoch": 0.5565008266947242, + "flos": 23114561594400.0, + "grad_norm": 2.30859996157536, + "language_loss": 0.69044209, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71713632, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.7865474224090576 + }, + { + "auxiliary_loss_clip": 0.01442396, + "auxiliary_loss_mlp": 0.01231875, + "balance_loss_clip": 1.12649488, + "balance_loss_mlp": 1.0344646, + "epoch": 0.5565609499473921, + "flos": 21580466005440.0, + "grad_norm": 1.8560142693252004, + "language_loss": 0.75698054, + "learning_rate": 1.73150038809119e-06, + "loss": 0.78372324, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 2.8194081783294678 + }, + { + "auxiliary_loss_clip": 0.01440792, + "auxiliary_loss_mlp": 0.01231259, + "balance_loss_clip": 1.12407458, + "balance_loss_mlp": 1.03298998, + "epoch": 0.5566210732000602, + "flos": 18371635752960.0, + "grad_norm": 2.6697433490699427, + "language_loss": 0.60601383, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63273436, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.7907049655914307 + }, + { + "auxiliary_loss_clip": 0.01438335, + "auxiliary_loss_mlp": 0.01225743, + "balance_loss_clip": 1.12268388, + "balance_loss_mlp": 1.02652013, + "epoch": 0.5566811964527281, + "flos": 25705763496000.0, + "grad_norm": 1.735241503734327, + "language_loss": 0.79119384, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81783462, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 2.8264427185058594 + }, + { + "auxiliary_loss_clip": 0.01435683, + "auxiliary_loss_mlp": 0.0122513, + "balance_loss_clip": 1.11842513, + "balance_loss_mlp": 1.02781487, + "epoch": 0.5567413197053961, + "flos": 26946481852800.0, + "grad_norm": 2.034085013520987, + "language_loss": 0.8150779, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.84168601, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.875047206878662 + }, + { + "auxiliary_loss_clip": 0.01437528, + "auxiliary_loss_mlp": 0.01237172, + "balance_loss_clip": 1.12150121, + "balance_loss_mlp": 1.03918874, + "epoch": 0.556801442958064, + "flos": 20852731113120.0, + "grad_norm": 1.6252220753028876, + "language_loss": 0.6908884, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71763539, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.788973331451416 + }, + { + "auxiliary_loss_clip": 0.01477512, + "auxiliary_loss_mlp": 0.01185188, + "balance_loss_clip": 1.179281, + "balance_loss_mlp": 0.99254608, + "epoch": 0.556861566210732, + "flos": 70504639490880.0, + "grad_norm": 0.7983145723263995, + "language_loss": 0.610883, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63751006, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 3.2838194370269775 + }, + { + "auxiliary_loss_clip": 0.01436118, + "auxiliary_loss_mlp": 0.01241081, + "balance_loss_clip": 1.12007165, + "balance_loss_mlp": 1.04491043, + "epoch": 0.5569216894633999, + "flos": 25339601396160.0, + "grad_norm": 1.5887339770018296, + "language_loss": 0.64911646, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.67588842, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 2.8546206951141357 + }, + { + "auxiliary_loss_clip": 0.01440714, + "auxiliary_loss_mlp": 0.01237519, + "balance_loss_clip": 1.12477446, + "balance_loss_mlp": 1.03686547, + "epoch": 0.556981812716068, + "flos": 22640985852480.0, + "grad_norm": 1.7849455220811605, + "language_loss": 0.73002434, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75680661, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 2.7713773250579834 + }, + { + "auxiliary_loss_clip": 0.01439862, + "auxiliary_loss_mlp": 0.01233374, + "balance_loss_clip": 1.12435102, + "balance_loss_mlp": 1.03281641, + "epoch": 0.5570419359687359, + "flos": 11037432153600.0, + "grad_norm": 2.1177771821167704, + "language_loss": 0.76349747, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.79022986, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 2.8302650451660156 + }, + { + "auxiliary_loss_clip": 0.01440617, + "auxiliary_loss_mlp": 0.01232392, + "balance_loss_clip": 1.12594569, + "balance_loss_mlp": 1.03393233, + "epoch": 0.5571020592214039, + "flos": 22825508172480.0, + "grad_norm": 1.790127761850953, + "language_loss": 0.70826936, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73499948, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.816901683807373 + }, + { + "auxiliary_loss_clip": 0.01436858, + "auxiliary_loss_mlp": 0.0124001, + "balance_loss_clip": 1.12067151, + "balance_loss_mlp": 1.03926158, + "epoch": 0.5571621824740719, + "flos": 22929887561760.0, + "grad_norm": 1.718268219779848, + "language_loss": 0.68218386, + "learning_rate": 1.727641538728533e-06, + "loss": 0.70895255, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 2.8263070583343506 + }, + { + "auxiliary_loss_clip": 0.0143943, + "auxiliary_loss_mlp": 0.01226344, + "balance_loss_clip": 1.12494254, + "balance_loss_mlp": 1.02874219, + "epoch": 0.5572223057267398, + "flos": 22968953930880.0, + "grad_norm": 2.157585518164896, + "language_loss": 0.74741316, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.77407092, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.8761374950408936 + }, + { + "auxiliary_loss_clip": 0.01432931, + "auxiliary_loss_mlp": 0.01238846, + "balance_loss_clip": 1.1179055, + "balance_loss_mlp": 1.04229403, + "epoch": 0.5572824289794078, + "flos": 20962192875840.0, + "grad_norm": 1.9976844607369328, + "language_loss": 0.746113, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77283078, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.8152735233306885 + }, + { + "auxiliary_loss_clip": 0.01428034, + "auxiliary_loss_mlp": 0.01223418, + "balance_loss_clip": 1.11190271, + "balance_loss_mlp": 1.02543533, + "epoch": 0.5573425522320757, + "flos": 25044555324960.0, + "grad_norm": 1.7932791738176102, + "language_loss": 0.82518148, + "learning_rate": 1.726484084647256e-06, + "loss": 0.85169601, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 2.846299886703491 + }, + { + "auxiliary_loss_clip": 0.01434746, + "auxiliary_loss_mlp": 0.01233177, + "balance_loss_clip": 1.1190033, + "balance_loss_mlp": 1.03281021, + "epoch": 0.5574026754847438, + "flos": 23661908336160.0, + "grad_norm": 2.090311821959768, + "language_loss": 0.79444718, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.82112646, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 4.303653240203857 + }, + { + "auxiliary_loss_clip": 0.01431474, + "auxiliary_loss_mlp": 0.01228419, + "balance_loss_clip": 1.11372972, + "balance_loss_mlp": 1.02824283, + "epoch": 0.5574627987374117, + "flos": 24784100100000.0, + "grad_norm": 3.1353277335140395, + "language_loss": 0.90417027, + "learning_rate": 1.725712500427442e-06, + "loss": 0.93076926, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.7590625286102295 + }, + { + "auxiliary_loss_clip": 0.01434366, + "auxiliary_loss_mlp": 0.01230573, + "balance_loss_clip": 1.11877036, + "balance_loss_mlp": 1.03087389, + "epoch": 0.5575229219900797, + "flos": 21837090486240.0, + "grad_norm": 2.042550531881931, + "language_loss": 0.8347792, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.86142862, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 2.776261568069458 + }, + { + "auxiliary_loss_clip": 0.014357, + "auxiliary_loss_mlp": 0.01230625, + "balance_loss_clip": 1.12029552, + "balance_loss_mlp": 1.02749217, + "epoch": 0.5575830452427476, + "flos": 27817434934560.0, + "grad_norm": 1.902131680980139, + "language_loss": 0.73983943, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76650262, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 2.7609102725982666 + }, + { + "auxiliary_loss_clip": 0.01442198, + "auxiliary_loss_mlp": 0.01233462, + "balance_loss_clip": 1.12443244, + "balance_loss_mlp": 1.02994823, + "epoch": 0.5576431684954156, + "flos": 17814238048800.0, + "grad_norm": 3.186126861786688, + "language_loss": 0.78097856, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.8077352, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.7608096599578857 + }, + { + "auxiliary_loss_clip": 0.01435035, + "auxiliary_loss_mlp": 0.01228458, + "balance_loss_clip": 1.11917186, + "balance_loss_mlp": 1.02990305, + "epoch": 0.5577032917480835, + "flos": 15488687530080.0, + "grad_norm": 2.0929575802206473, + "language_loss": 0.75029171, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.77692664, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 2.7019050121307373 + }, + { + "auxiliary_loss_clip": 0.01425157, + "auxiliary_loss_mlp": 0.01230201, + "balance_loss_clip": 1.10870707, + "balance_loss_mlp": 1.03021598, + "epoch": 0.5577634150007516, + "flos": 21581907275520.0, + "grad_norm": 2.006915854748332, + "language_loss": 0.75064677, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77720034, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 4.2654736042022705 + }, + { + "auxiliary_loss_clip": 0.0143238, + "auxiliary_loss_mlp": 0.01220844, + "balance_loss_clip": 1.11610651, + "balance_loss_mlp": 1.02095342, + "epoch": 0.5578235382534195, + "flos": 21141746606880.0, + "grad_norm": 1.6352672730213988, + "language_loss": 0.71726847, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.74380076, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.843033790588379 + }, + { + "auxiliary_loss_clip": 0.01440241, + "auxiliary_loss_mlp": 0.01231577, + "balance_loss_clip": 1.12523782, + "balance_loss_mlp": 1.02977908, + "epoch": 0.5578836615060875, + "flos": 26507610741600.0, + "grad_norm": 1.9179583920890082, + "language_loss": 0.75919569, + "learning_rate": 1.723012284057868e-06, + "loss": 0.78591394, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.7944931983947754 + }, + { + "auxiliary_loss_clip": 0.01435937, + "auxiliary_loss_mlp": 0.0122695, + "balance_loss_clip": 1.11872029, + "balance_loss_mlp": 1.02791786, + "epoch": 0.5579437847587555, + "flos": 20155680466560.0, + "grad_norm": 1.910700350062829, + "language_loss": 0.67276865, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69939756, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 4.267322301864624 + }, + { + "auxiliary_loss_clip": 0.01431517, + "auxiliary_loss_mlp": 0.01243423, + "balance_loss_clip": 1.11512387, + "balance_loss_mlp": 1.04591739, + "epoch": 0.5580039080114234, + "flos": 26104582105920.0, + "grad_norm": 1.6759506143196141, + "language_loss": 0.73352718, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.76027656, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.8833727836608887 + }, + { + "auxiliary_loss_clip": 0.01437613, + "auxiliary_loss_mlp": 0.01229515, + "balance_loss_clip": 1.12128747, + "balance_loss_mlp": 1.03038836, + "epoch": 0.5580640312640914, + "flos": 13773369371040.0, + "grad_norm": 4.974112976360993, + "language_loss": 0.75547224, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.78214359, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.75565767288208 + }, + { + "auxiliary_loss_clip": 0.01432709, + "auxiliary_loss_mlp": 0.01230675, + "balance_loss_clip": 1.11636806, + "balance_loss_mlp": 1.03440857, + "epoch": 0.5581241545167593, + "flos": 17677922784480.0, + "grad_norm": 1.7860852487929921, + "language_loss": 0.66455704, + "learning_rate": 1.721469534028297e-06, + "loss": 0.6911909, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.8173599243164062 + }, + { + "auxiliary_loss_clip": 0.01430672, + "auxiliary_loss_mlp": 0.01227437, + "balance_loss_clip": 1.11379921, + "balance_loss_mlp": 1.02840543, + "epoch": 0.5581842777694274, + "flos": 19570974122880.0, + "grad_norm": 1.7477754696004932, + "language_loss": 0.83136356, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85794467, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 4.31097149848938 + }, + { + "auxiliary_loss_clip": 0.01433666, + "auxiliary_loss_mlp": 0.01226323, + "balance_loss_clip": 1.1169517, + "balance_loss_mlp": 1.02547979, + "epoch": 0.5582444010220953, + "flos": 20597282405280.0, + "grad_norm": 2.6396436766175113, + "language_loss": 0.8497557, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87635565, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 2.7502450942993164 + }, + { + "auxiliary_loss_clip": 0.01435461, + "auxiliary_loss_mlp": 0.01239309, + "balance_loss_clip": 1.11864638, + "balance_loss_mlp": 1.03884661, + "epoch": 0.5583045242747633, + "flos": 19137526738560.0, + "grad_norm": 3.4122239752348733, + "language_loss": 0.73220754, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75895524, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 2.8446717262268066 + }, + { + "auxiliary_loss_clip": 0.01435549, + "auxiliary_loss_mlp": 0.01227847, + "balance_loss_clip": 1.11692762, + "balance_loss_mlp": 1.02604938, + "epoch": 0.5583646475274312, + "flos": 27457075843200.0, + "grad_norm": 1.7085465004671518, + "language_loss": 0.74064529, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76727927, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.8480470180511475 + }, + { + "auxiliary_loss_clip": 0.01433981, + "auxiliary_loss_mlp": 0.01233714, + "balance_loss_clip": 1.11582685, + "balance_loss_mlp": 1.03248823, + "epoch": 0.5584247707800992, + "flos": 23655726046080.0, + "grad_norm": 1.8178384174921258, + "language_loss": 0.7489602, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77563715, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 2.8656818866729736 + }, + { + "auxiliary_loss_clip": 0.01438419, + "auxiliary_loss_mlp": 0.01237453, + "balance_loss_clip": 1.12101042, + "balance_loss_mlp": 1.03527451, + "epoch": 0.5584848940327671, + "flos": 13700584503360.0, + "grad_norm": 2.170957360776503, + "language_loss": 0.78077376, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.80753243, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.7896575927734375 + }, + { + "auxiliary_loss_clip": 0.01433176, + "auxiliary_loss_mlp": 0.01229877, + "balance_loss_clip": 1.11460245, + "balance_loss_mlp": 1.02950978, + "epoch": 0.5585450172854352, + "flos": 27018659869920.0, + "grad_norm": 1.742814394528739, + "language_loss": 0.61209595, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63872653, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.8904991149902344 + }, + { + "auxiliary_loss_clip": 0.01423651, + "auxiliary_loss_mlp": 0.01237691, + "balance_loss_clip": 1.10598278, + "balance_loss_mlp": 1.03741956, + "epoch": 0.5586051405381031, + "flos": 23187915384480.0, + "grad_norm": 2.880986276987096, + "language_loss": 0.67666101, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70327449, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 2.8684918880462646 + }, + { + "auxiliary_loss_clip": 0.0143108, + "auxiliary_loss_mlp": 0.01234912, + "balance_loss_clip": 1.11290264, + "balance_loss_mlp": 1.03597569, + "epoch": 0.5586652637907711, + "flos": 20777556771360.0, + "grad_norm": 3.5246363022495464, + "language_loss": 0.83544302, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.86210293, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.7564241886138916 + }, + { + "auxiliary_loss_clip": 0.01427745, + "auxiliary_loss_mlp": 0.01231042, + "balance_loss_clip": 1.11036086, + "balance_loss_mlp": 1.03277326, + "epoch": 0.5587253870434391, + "flos": 28222397906400.0, + "grad_norm": 2.199050124868234, + "language_loss": 0.73354286, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.76013076, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 2.841670036315918 + }, + { + "auxiliary_loss_clip": 0.01433606, + "auxiliary_loss_mlp": 0.01230321, + "balance_loss_clip": 1.11415958, + "balance_loss_mlp": 1.03195643, + "epoch": 0.558785510296107, + "flos": 26618210349120.0, + "grad_norm": 2.573747920034944, + "language_loss": 0.72868997, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.75532925, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 2.842101573944092 + }, + { + "auxiliary_loss_clip": 0.01426319, + "auxiliary_loss_mlp": 0.01236003, + "balance_loss_clip": 1.10770845, + "balance_loss_mlp": 1.03811514, + "epoch": 0.558845633548775, + "flos": 20158828503840.0, + "grad_norm": 3.6664746124702345, + "language_loss": 0.68304181, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70966506, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.803410291671753 + }, + { + "auxiliary_loss_clip": 0.01434751, + "auxiliary_loss_mlp": 0.01230975, + "balance_loss_clip": 1.11416507, + "balance_loss_mlp": 1.03356457, + "epoch": 0.5589057568014429, + "flos": 24352966333440.0, + "grad_norm": 1.5631902465785255, + "language_loss": 0.80648851, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.83314574, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.8638417720794678 + }, + { + "auxiliary_loss_clip": 0.01432919, + "auxiliary_loss_mlp": 0.01224808, + "balance_loss_clip": 1.11342502, + "balance_loss_mlp": 1.02673042, + "epoch": 0.558965880054111, + "flos": 21107269545120.0, + "grad_norm": 2.0262726977295027, + "language_loss": 0.65702409, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.68360132, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.7617409229278564 + }, + { + "auxiliary_loss_clip": 0.01433992, + "auxiliary_loss_mlp": 0.01239642, + "balance_loss_clip": 1.1147759, + "balance_loss_mlp": 1.04013371, + "epoch": 0.5590260033067789, + "flos": 18437707336320.0, + "grad_norm": 1.687685850365947, + "language_loss": 0.75352418, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.78026056, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.794816255569458 + }, + { + "auxiliary_loss_clip": 0.01480205, + "auxiliary_loss_mlp": 0.01212898, + "balance_loss_clip": 1.18149281, + "balance_loss_mlp": 1.0233078, + "epoch": 0.5590861265594469, + "flos": 70584213499200.0, + "grad_norm": 0.6791766732385746, + "language_loss": 0.52325916, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.55019021, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 3.319758176803589 + }, + { + "auxiliary_loss_clip": 0.01434059, + "auxiliary_loss_mlp": 0.01228045, + "balance_loss_clip": 1.11501825, + "balance_loss_mlp": 1.03177857, + "epoch": 0.5591462498121148, + "flos": 30667043940480.0, + "grad_norm": 2.2176935666341135, + "language_loss": 0.68987668, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.71649766, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.9098522663116455 + }, + { + "auxiliary_loss_clip": 0.0143226, + "auxiliary_loss_mlp": 0.01230949, + "balance_loss_clip": 1.11439669, + "balance_loss_mlp": 1.03248978, + "epoch": 0.5592063730647828, + "flos": 18152408802240.0, + "grad_norm": 1.8357381034040579, + "language_loss": 0.81789482, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84452689, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 2.7786972522735596 + }, + { + "auxiliary_loss_clip": 0.01433747, + "auxiliary_loss_mlp": 0.01223437, + "balance_loss_clip": 1.11407948, + "balance_loss_mlp": 1.02602696, + "epoch": 0.5592664963174507, + "flos": 24062661282240.0, + "grad_norm": 2.6081798720441056, + "language_loss": 0.67637086, + "learning_rate": 1.714143795138756e-06, + "loss": 0.70294273, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 2.8117129802703857 + }, + { + "auxiliary_loss_clip": 0.01433959, + "auxiliary_loss_mlp": 0.01234975, + "balance_loss_clip": 1.11339068, + "balance_loss_mlp": 1.03518033, + "epoch": 0.5593266195701188, + "flos": 19829912221440.0, + "grad_norm": 2.2872771416554127, + "language_loss": 0.71052885, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73721814, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.789454936981201 + }, + { + "auxiliary_loss_clip": 0.01444627, + "auxiliary_loss_mlp": 0.01219915, + "balance_loss_clip": 1.12565255, + "balance_loss_mlp": 1.02145576, + "epoch": 0.5593867428227867, + "flos": 25303076213760.0, + "grad_norm": 1.6198976580296263, + "language_loss": 0.72878337, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.75542879, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 2.7891364097595215 + }, + { + "auxiliary_loss_clip": 0.01433, + "auxiliary_loss_mlp": 0.01228036, + "balance_loss_clip": 1.1135428, + "balance_loss_mlp": 1.02662015, + "epoch": 0.5594468660754547, + "flos": 12934693517760.0, + "grad_norm": 2.027545281205445, + "language_loss": 0.77665573, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80326611, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.7618730068206787 + }, + { + "auxiliary_loss_clip": 0.0143644, + "auxiliary_loss_mlp": 0.01225497, + "balance_loss_clip": 1.11806774, + "balance_loss_mlp": 1.02646494, + "epoch": 0.5595069893281227, + "flos": 19064741870880.0, + "grad_norm": 2.6402952868181955, + "language_loss": 0.69495225, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.72157162, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 2.783165454864502 + }, + { + "auxiliary_loss_clip": 0.014909, + "auxiliary_loss_mlp": 0.01203728, + "balance_loss_clip": 1.1901747, + "balance_loss_mlp": 1.01070404, + "epoch": 0.5595671125807906, + "flos": 70279457819040.0, + "grad_norm": 0.9285247440006633, + "language_loss": 0.60233581, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62928212, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.414100408554077 + }, + { + "auxiliary_loss_clip": 0.01439098, + "auxiliary_loss_mlp": 0.01218551, + "balance_loss_clip": 1.12002695, + "balance_loss_mlp": 1.01999593, + "epoch": 0.5596272358334586, + "flos": 20667488158080.0, + "grad_norm": 1.8709929861765333, + "language_loss": 0.7407847, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76736128, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 2.867949962615967 + }, + { + "auxiliary_loss_clip": 0.01440875, + "auxiliary_loss_mlp": 0.01220739, + "balance_loss_clip": 1.12175965, + "balance_loss_mlp": 1.02046776, + "epoch": 0.5596873590861265, + "flos": 25043076126720.0, + "grad_norm": 1.815025943869961, + "language_loss": 0.6953094, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.7219255, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 4.226285219192505 + }, + { + "auxiliary_loss_clip": 0.01439017, + "auxiliary_loss_mlp": 0.01228089, + "balance_loss_clip": 1.12063694, + "balance_loss_mlp": 1.02667356, + "epoch": 0.5597474823387946, + "flos": 25960794994080.0, + "grad_norm": 1.9880434026927867, + "language_loss": 0.75019228, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77686334, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 2.8902618885040283 + }, + { + "auxiliary_loss_clip": 0.01446784, + "auxiliary_loss_mlp": 0.01229878, + "balance_loss_clip": 1.12612391, + "balance_loss_mlp": 1.02464712, + "epoch": 0.5598076055914625, + "flos": 26179870232160.0, + "grad_norm": 3.9653479986650977, + "language_loss": 0.6947006, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.72146714, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 2.849303722381592 + }, + { + "auxiliary_loss_clip": 0.0143587, + "auxiliary_loss_mlp": 0.01229284, + "balance_loss_clip": 1.11714005, + "balance_loss_mlp": 1.02882183, + "epoch": 0.5598677288441305, + "flos": 11657374122240.0, + "grad_norm": 2.0989377892006704, + "language_loss": 0.72226048, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74891204, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 2.758106231689453 + }, + { + "auxiliary_loss_clip": 0.01439471, + "auxiliary_loss_mlp": 0.01226118, + "balance_loss_clip": 1.1216166, + "balance_loss_mlp": 1.02680016, + "epoch": 0.5599278520967984, + "flos": 22968840146400.0, + "grad_norm": 2.0419524923966885, + "language_loss": 0.89311409, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91977, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.799955129623413 + }, + { + "auxiliary_loss_clip": 0.01441151, + "auxiliary_loss_mlp": 0.01233243, + "balance_loss_clip": 1.1231823, + "balance_loss_mlp": 1.03335261, + "epoch": 0.5599879753494664, + "flos": 21217717440000.0, + "grad_norm": 1.668822887875127, + "language_loss": 0.78031409, + "learning_rate": 1.709519022520204e-06, + "loss": 0.8070581, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 2.7753891944885254 + }, + { + "auxiliary_loss_clip": 0.01433541, + "auxiliary_loss_mlp": 0.01220504, + "balance_loss_clip": 1.11382318, + "balance_loss_mlp": 1.020805, + "epoch": 0.5600480986021343, + "flos": 31905827961120.0, + "grad_norm": 1.9729598154406756, + "language_loss": 0.70127612, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72781658, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 2.8458142280578613 + }, + { + "auxiliary_loss_clip": 0.01438934, + "auxiliary_loss_mlp": 0.01227967, + "balance_loss_clip": 1.12033224, + "balance_loss_mlp": 1.02779055, + "epoch": 0.5601082218548024, + "flos": 28478605177440.0, + "grad_norm": 2.076489462033761, + "language_loss": 0.67210442, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.69877344, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 4.369388580322266 + }, + { + "auxiliary_loss_clip": 0.01439245, + "auxiliary_loss_mlp": 0.01220978, + "balance_loss_clip": 1.11970007, + "balance_loss_mlp": 1.0191803, + "epoch": 0.5601683451074703, + "flos": 24099451961760.0, + "grad_norm": 2.2629568088200536, + "language_loss": 0.86677516, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.89337742, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 2.7609333992004395 + }, + { + "auxiliary_loss_clip": 0.01438088, + "auxiliary_loss_mlp": 0.01225906, + "balance_loss_clip": 1.117805, + "balance_loss_mlp": 1.02239156, + "epoch": 0.5602284683601383, + "flos": 26358437831040.0, + "grad_norm": 1.846209196073544, + "language_loss": 0.77372098, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.80036092, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 4.182339191436768 + }, + { + "auxiliary_loss_clip": 0.01436045, + "auxiliary_loss_mlp": 0.01225155, + "balance_loss_clip": 1.11676598, + "balance_loss_mlp": 1.02812612, + "epoch": 0.5602885916128063, + "flos": 24498573996960.0, + "grad_norm": 1.4456584473843668, + "language_loss": 0.7618227, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78843462, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.8371622562408447 + }, + { + "auxiliary_loss_clip": 0.01437951, + "auxiliary_loss_mlp": 0.01223012, + "balance_loss_clip": 1.11991656, + "balance_loss_mlp": 1.02417123, + "epoch": 0.5603487148654742, + "flos": 27347500296000.0, + "grad_norm": 1.4727923903323807, + "language_loss": 0.85190779, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87851745, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.848019599914551 + }, + { + "auxiliary_loss_clip": 0.01491469, + "auxiliary_loss_mlp": 0.01200371, + "balance_loss_clip": 1.18988585, + "balance_loss_mlp": 1.00772858, + "epoch": 0.5604088381181422, + "flos": 54093719278080.0, + "grad_norm": 0.754100563369026, + "language_loss": 0.5252732, + "learning_rate": 1.706821969374996e-06, + "loss": 0.55219162, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 3.1641082763671875 + }, + { + "auxiliary_loss_clip": 0.01441934, + "auxiliary_loss_mlp": 0.01236402, + "balance_loss_clip": 1.1237725, + "balance_loss_mlp": 1.03679752, + "epoch": 0.5604689613708101, + "flos": 22238564067360.0, + "grad_norm": 1.3757435764057293, + "language_loss": 0.74470162, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.77148497, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 4.459918260574341 + }, + { + "auxiliary_loss_clip": 0.01437091, + "auxiliary_loss_mlp": 0.01225644, + "balance_loss_clip": 1.11885571, + "balance_loss_mlp": 1.02785158, + "epoch": 0.5605290846234782, + "flos": 35300659731840.0, + "grad_norm": 1.6252033280244786, + "language_loss": 0.73836708, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76499444, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 3.013690948486328 + }, + { + "auxiliary_loss_clip": 0.01438613, + "auxiliary_loss_mlp": 0.01227131, + "balance_loss_clip": 1.12006736, + "balance_loss_mlp": 1.02829015, + "epoch": 0.5605892078761461, + "flos": 20265218085600.0, + "grad_norm": 2.306606649350455, + "language_loss": 0.62161052, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.64826798, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.8667874336242676 + }, + { + "auxiliary_loss_clip": 0.01433323, + "auxiliary_loss_mlp": 0.01224631, + "balance_loss_clip": 1.11431515, + "balance_loss_mlp": 1.0245502, + "epoch": 0.5606493311288141, + "flos": 17310091845600.0, + "grad_norm": 1.9253756830175983, + "language_loss": 0.88094199, + "learning_rate": 1.705281040409226e-06, + "loss": 0.90752149, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 2.7833733558654785 + }, + { + "auxiliary_loss_clip": 0.01443695, + "auxiliary_loss_mlp": 0.01223287, + "balance_loss_clip": 1.12506282, + "balance_loss_mlp": 1.02301562, + "epoch": 0.560709454381482, + "flos": 21655147281120.0, + "grad_norm": 1.5591283587506235, + "language_loss": 0.73861301, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76528287, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 2.7942869663238525 + }, + { + "auxiliary_loss_clip": 0.01441324, + "auxiliary_loss_mlp": 0.01240006, + "balance_loss_clip": 1.12212849, + "balance_loss_mlp": 1.03820837, + "epoch": 0.56076957763415, + "flos": 20305460227680.0, + "grad_norm": 2.062100557098519, + "language_loss": 0.78083813, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.8076514, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.7963485717773438 + }, + { + "auxiliary_loss_clip": 0.01444213, + "auxiliary_loss_mlp": 0.01236231, + "balance_loss_clip": 1.12456751, + "balance_loss_mlp": 1.03500617, + "epoch": 0.5608297008868179, + "flos": 25048234356480.0, + "grad_norm": 1.6630411515076622, + "language_loss": 0.78227186, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80907631, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 2.8301994800567627 + }, + { + "auxiliary_loss_clip": 0.01442629, + "auxiliary_loss_mlp": 0.01232184, + "balance_loss_clip": 1.12299037, + "balance_loss_mlp": 1.03401041, + "epoch": 0.560889824139486, + "flos": 19868978590560.0, + "grad_norm": 1.5433441103861363, + "language_loss": 0.73647773, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.76322585, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.810624599456787 + }, + { + "auxiliary_loss_clip": 0.01440264, + "auxiliary_loss_mlp": 0.01234877, + "balance_loss_clip": 1.12007856, + "balance_loss_mlp": 1.03794312, + "epoch": 0.5609499473921539, + "flos": 22931594328960.0, + "grad_norm": 1.6665682620271443, + "language_loss": 0.83529299, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.86204445, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.803854465484619 + }, + { + "auxiliary_loss_clip": 0.01507873, + "auxiliary_loss_mlp": 0.01183044, + "balance_loss_clip": 1.20447433, + "balance_loss_mlp": 0.98963928, + "epoch": 0.5610100706448219, + "flos": 53042074620480.0, + "grad_norm": 0.7266957382891404, + "language_loss": 0.57776862, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.6046778, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 3.3631222248077393 + }, + { + "auxiliary_loss_clip": 0.01444311, + "auxiliary_loss_mlp": 0.01221536, + "balance_loss_clip": 1.12430501, + "balance_loss_mlp": 1.02345777, + "epoch": 0.5610701938974898, + "flos": 21837090486240.0, + "grad_norm": 2.1297613902246177, + "language_loss": 0.81732315, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84398156, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 2.8204784393310547 + }, + { + "auxiliary_loss_clip": 0.01450926, + "auxiliary_loss_mlp": 0.01235631, + "balance_loss_clip": 1.12957847, + "balance_loss_mlp": 1.03307056, + "epoch": 0.5611303171501578, + "flos": 17459113043520.0, + "grad_norm": 2.15057234711735, + "language_loss": 0.81631052, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.84317601, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.752558708190918 + }, + { + "auxiliary_loss_clip": 0.01443048, + "auxiliary_loss_mlp": 0.0122322, + "balance_loss_clip": 1.12396991, + "balance_loss_mlp": 1.02676344, + "epoch": 0.5611904404028258, + "flos": 22639999720320.0, + "grad_norm": 1.6361970257469831, + "language_loss": 0.72700608, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.75366879, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.7952792644500732 + }, + { + "auxiliary_loss_clip": 0.01452252, + "auxiliary_loss_mlp": 0.01227872, + "balance_loss_clip": 1.13145614, + "balance_loss_mlp": 1.03122473, + "epoch": 0.5612505636554938, + "flos": 14317909428960.0, + "grad_norm": 1.9959192040780656, + "language_loss": 0.70842844, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73522967, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.7559781074523926 + }, + { + "auxiliary_loss_clip": 0.01445916, + "auxiliary_loss_mlp": 0.01238029, + "balance_loss_clip": 1.12495708, + "balance_loss_mlp": 1.04033196, + "epoch": 0.5613106869081618, + "flos": 16510141008000.0, + "grad_norm": 1.9599728717935196, + "language_loss": 0.76881009, + "learning_rate": 1.701044410566205e-06, + "loss": 0.79564953, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 2.7736494541168213 + }, + { + "auxiliary_loss_clip": 0.01449189, + "auxiliary_loss_mlp": 0.01231422, + "balance_loss_clip": 1.12824488, + "balance_loss_mlp": 1.03315318, + "epoch": 0.5613708101608297, + "flos": 24060726946080.0, + "grad_norm": 2.358579380293774, + "language_loss": 0.64944041, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.67624646, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.7843642234802246 + }, + { + "auxiliary_loss_clip": 0.01505416, + "auxiliary_loss_mlp": 0.01200447, + "balance_loss_clip": 1.19999695, + "balance_loss_mlp": 1.00818634, + "epoch": 0.5614309334134977, + "flos": 64911280066560.0, + "grad_norm": 0.8879944532108303, + "language_loss": 0.62542433, + "learning_rate": 1.700274261035102e-06, + "loss": 0.65248299, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 3.3251969814300537 + }, + { + "auxiliary_loss_clip": 0.01447037, + "auxiliary_loss_mlp": 0.01233119, + "balance_loss_clip": 1.12599778, + "balance_loss_mlp": 1.03465915, + "epoch": 0.5614910566661656, + "flos": 32922236993760.0, + "grad_norm": 2.0431124523058646, + "language_loss": 0.65796947, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.68477106, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 2.899782419204712 + }, + { + "auxiliary_loss_clip": 0.01453297, + "auxiliary_loss_mlp": 0.01227492, + "balance_loss_clip": 1.13304305, + "balance_loss_mlp": 1.02674413, + "epoch": 0.5615511799188336, + "flos": 18590900631840.0, + "grad_norm": 1.7281619645289366, + "language_loss": 0.69985318, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72666109, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 2.7238166332244873 + }, + { + "auxiliary_loss_clip": 0.01453171, + "auxiliary_loss_mlp": 0.01231758, + "balance_loss_clip": 1.1330626, + "balance_loss_mlp": 1.03224945, + "epoch": 0.5616113031715015, + "flos": 22822208422560.0, + "grad_norm": 1.600425056406738, + "language_loss": 0.77550805, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.80235738, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 2.848162889480591 + }, + { + "auxiliary_loss_clip": 0.01445609, + "auxiliary_loss_mlp": 0.01235643, + "balance_loss_clip": 1.12336552, + "balance_loss_mlp": 1.03737414, + "epoch": 0.5616714264241696, + "flos": 22347874117440.0, + "grad_norm": 1.7951239316744443, + "language_loss": 0.79308236, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81989491, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 2.7684988975524902 + }, + { + "auxiliary_loss_clip": 0.01448452, + "auxiliary_loss_mlp": 0.01232764, + "balance_loss_clip": 1.12870336, + "balance_loss_mlp": 1.03315961, + "epoch": 0.5617315496768375, + "flos": 18809824157280.0, + "grad_norm": 8.544950222023003, + "language_loss": 0.76333499, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.79014713, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 2.8049192428588867 + }, + { + "auxiliary_loss_clip": 0.01446461, + "auxiliary_loss_mlp": 0.0123602, + "balance_loss_clip": 1.12822437, + "balance_loss_mlp": 1.03574824, + "epoch": 0.5617916729295055, + "flos": 18371294399520.0, + "grad_norm": 1.8370151779758415, + "language_loss": 0.68759942, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.71442425, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.7629494667053223 + }, + { + "auxiliary_loss_clip": 0.01451241, + "auxiliary_loss_mlp": 0.01237868, + "balance_loss_clip": 1.12932515, + "balance_loss_mlp": 1.03559339, + "epoch": 0.5618517961821734, + "flos": 28182269548800.0, + "grad_norm": 2.3631613354507226, + "language_loss": 0.66465151, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.69154263, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.860776662826538 + }, + { + "auxiliary_loss_clip": 0.01447542, + "auxiliary_loss_mlp": 0.01236318, + "balance_loss_clip": 1.12745416, + "balance_loss_mlp": 1.0361414, + "epoch": 0.5619119194348414, + "flos": 15488687530080.0, + "grad_norm": 2.1272557258538654, + "language_loss": 0.87443358, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.90127218, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 2.7207694053649902 + }, + { + "auxiliary_loss_clip": 0.01448455, + "auxiliary_loss_mlp": 0.01236756, + "balance_loss_clip": 1.12741077, + "balance_loss_mlp": 1.03591239, + "epoch": 0.5619720426875094, + "flos": 29131014015360.0, + "grad_norm": 2.1632346397123525, + "language_loss": 0.59774637, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.62459844, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 4.196431398391724 + }, + { + "auxiliary_loss_clip": 0.0145044, + "auxiliary_loss_mlp": 0.01227468, + "balance_loss_clip": 1.12847281, + "balance_loss_mlp": 1.0252893, + "epoch": 0.5620321659401774, + "flos": 18005663293920.0, + "grad_norm": 2.140518831181768, + "language_loss": 0.69081676, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71759582, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 2.7718067169189453 + }, + { + "auxiliary_loss_clip": 0.01447266, + "auxiliary_loss_mlp": 0.01230011, + "balance_loss_clip": 1.12591076, + "balance_loss_mlp": 1.02687836, + "epoch": 0.5620922891928454, + "flos": 20596599698400.0, + "grad_norm": 7.515127463141074, + "language_loss": 0.79207492, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81884766, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 2.7539355754852295 + }, + { + "auxiliary_loss_clip": 0.01451524, + "auxiliary_loss_mlp": 0.01225974, + "balance_loss_clip": 1.12951171, + "balance_loss_mlp": 1.02293706, + "epoch": 0.5621524124455133, + "flos": 26289445779360.0, + "grad_norm": 2.1517314593779453, + "language_loss": 0.66747034, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69424522, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 2.856208086013794 + }, + { + "auxiliary_loss_clip": 0.01445697, + "auxiliary_loss_mlp": 0.01228256, + "balance_loss_clip": 1.12621737, + "balance_loss_mlp": 1.02703059, + "epoch": 0.5622125356981813, + "flos": 12751953821280.0, + "grad_norm": 2.2990585922357645, + "language_loss": 0.79257143, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.8193109, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.743446111679077 + }, + { + "auxiliary_loss_clip": 0.01445697, + "auxiliary_loss_mlp": 0.01230272, + "balance_loss_clip": 1.1248945, + "balance_loss_mlp": 1.03104925, + "epoch": 0.5622726589508492, + "flos": 23807591856000.0, + "grad_norm": 1.6782729612829688, + "language_loss": 0.58981061, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.61657035, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.9337620735168457 + }, + { + "auxiliary_loss_clip": 0.01443535, + "auxiliary_loss_mlp": 0.01223341, + "balance_loss_clip": 1.12365007, + "balance_loss_mlp": 1.02583551, + "epoch": 0.5623327822035172, + "flos": 24720569703360.0, + "grad_norm": 1.2847951397147575, + "language_loss": 0.71913266, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.74580145, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 3.0195319652557373 + }, + { + "auxiliary_loss_clip": 0.01445436, + "auxiliary_loss_mlp": 0.01229998, + "balance_loss_clip": 1.12432516, + "balance_loss_mlp": 1.02905905, + "epoch": 0.5623929054561851, + "flos": 14020549740000.0, + "grad_norm": 2.7986617330814982, + "language_loss": 0.76537454, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.79212892, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 4.36852240562439 + }, + { + "auxiliary_loss_clip": 0.01446705, + "auxiliary_loss_mlp": 0.01231439, + "balance_loss_clip": 1.12563872, + "balance_loss_mlp": 1.03164446, + "epoch": 0.5624530287088532, + "flos": 20706516599040.0, + "grad_norm": 1.7517143912581237, + "language_loss": 0.72794801, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.75472951, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 2.7735328674316406 + }, + { + "auxiliary_loss_clip": 0.01444709, + "auxiliary_loss_mlp": 0.01233534, + "balance_loss_clip": 1.12401819, + "balance_loss_mlp": 1.03497922, + "epoch": 0.5625131519615211, + "flos": 21473090291520.0, + "grad_norm": 1.714333353171054, + "language_loss": 0.73642969, + "learning_rate": 1.693344975084274e-06, + "loss": 0.76321208, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.8004751205444336 + }, + { + "auxiliary_loss_clip": 0.01451336, + "auxiliary_loss_mlp": 0.01229174, + "balance_loss_clip": 1.1301657, + "balance_loss_mlp": 1.02918839, + "epoch": 0.5625732752141891, + "flos": 18700021041120.0, + "grad_norm": 2.1217703332932416, + "language_loss": 0.8348335, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.86163855, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 4.241018056869507 + }, + { + "auxiliary_loss_clip": 0.0144723, + "auxiliary_loss_mlp": 0.01232478, + "balance_loss_clip": 1.12670887, + "balance_loss_mlp": 1.03535342, + "epoch": 0.562633398466857, + "flos": 16218773968320.0, + "grad_norm": 4.531567542452893, + "language_loss": 0.72099996, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74779701, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.7671847343444824 + }, + { + "auxiliary_loss_clip": 0.0144288, + "auxiliary_loss_mlp": 0.01229153, + "balance_loss_clip": 1.12132835, + "balance_loss_mlp": 1.03250563, + "epoch": 0.562693521719525, + "flos": 22494126559680.0, + "grad_norm": 1.733382111234854, + "language_loss": 0.77753168, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.80425203, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.8010189533233643 + }, + { + "auxiliary_loss_clip": 0.0144438, + "auxiliary_loss_mlp": 0.01234245, + "balance_loss_clip": 1.1242094, + "balance_loss_mlp": 1.03492737, + "epoch": 0.562753644972193, + "flos": 25333153608960.0, + "grad_norm": 1.722421445410581, + "language_loss": 0.70107931, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72786558, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.786954164505005 + }, + { + "auxiliary_loss_clip": 0.01489788, + "auxiliary_loss_mlp": 0.01226044, + "balance_loss_clip": 1.18389833, + "balance_loss_mlp": 1.0345459, + "epoch": 0.562813768224861, + "flos": 67398633573120.0, + "grad_norm": 0.7769547367543598, + "language_loss": 0.55575395, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.58291221, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 4.70980978012085 + }, + { + "auxiliary_loss_clip": 0.01445974, + "auxiliary_loss_mlp": 0.01220797, + "balance_loss_clip": 1.12619185, + "balance_loss_mlp": 1.02290916, + "epoch": 0.562873891477529, + "flos": 23333409263520.0, + "grad_norm": 2.7233013100279506, + "language_loss": 0.8190186, + "learning_rate": 1.691036046141018e-06, + "loss": 0.84568632, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.8172671794891357 + }, + { + "auxiliary_loss_clip": 0.01442976, + "auxiliary_loss_mlp": 0.0122736, + "balance_loss_clip": 1.12185788, + "balance_loss_mlp": 1.03052139, + "epoch": 0.5629340147301969, + "flos": 38475923198400.0, + "grad_norm": 1.7950002847130633, + "language_loss": 0.74416196, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.77086526, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.933147668838501 + }, + { + "auxiliary_loss_clip": 0.01440438, + "auxiliary_loss_mlp": 0.01231333, + "balance_loss_clip": 1.12013543, + "balance_loss_mlp": 1.03363597, + "epoch": 0.5629941379828649, + "flos": 29244609947520.0, + "grad_norm": 2.0111789064543584, + "language_loss": 0.82832354, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85504127, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.8424105644226074 + }, + { + "auxiliary_loss_clip": 0.01449249, + "auxiliary_loss_mlp": 0.01235782, + "balance_loss_clip": 1.12781143, + "balance_loss_mlp": 1.03474736, + "epoch": 0.5630542612355328, + "flos": 19422559775520.0, + "grad_norm": 2.5496238626250176, + "language_loss": 0.65026283, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67711312, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 2.800311326980591 + }, + { + "auxiliary_loss_clip": 0.01437312, + "auxiliary_loss_mlp": 0.01234865, + "balance_loss_clip": 1.11717749, + "balance_loss_mlp": 1.03621483, + "epoch": 0.5631143844882008, + "flos": 22268110468320.0, + "grad_norm": 3.4234632281787993, + "language_loss": 0.81341821, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.84013999, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.7988975048065186 + }, + { + "auxiliary_loss_clip": 0.01446255, + "auxiliary_loss_mlp": 0.01231211, + "balance_loss_clip": 1.12508655, + "balance_loss_mlp": 1.03227472, + "epoch": 0.5631745077408687, + "flos": 22967474732640.0, + "grad_norm": 1.466145443766182, + "language_loss": 0.73574007, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.76251471, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.8070356845855713 + }, + { + "auxiliary_loss_clip": 0.01481032, + "auxiliary_loss_mlp": 0.01200378, + "balance_loss_clip": 1.17534447, + "balance_loss_mlp": 1.00888062, + "epoch": 0.5632346309935368, + "flos": 65087685396000.0, + "grad_norm": 0.619502762799232, + "language_loss": 0.53395092, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.56076503, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 3.436574935913086 + }, + { + "auxiliary_loss_clip": 0.014419, + "auxiliary_loss_mlp": 0.01226073, + "balance_loss_clip": 1.12093472, + "balance_loss_mlp": 1.02646899, + "epoch": 0.5632947542462047, + "flos": 23005137759840.0, + "grad_norm": 3.0929769648007106, + "language_loss": 0.69017279, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71685255, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 2.832289457321167 + }, + { + "auxiliary_loss_clip": 0.0143759, + "auxiliary_loss_mlp": 0.01224879, + "balance_loss_clip": 1.11505485, + "balance_loss_mlp": 1.02517927, + "epoch": 0.5633548774988727, + "flos": 30485252448000.0, + "grad_norm": 1.8869011071990403, + "language_loss": 0.7574963, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78412092, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.8435964584350586 + }, + { + "auxiliary_loss_clip": 0.014323, + "auxiliary_loss_mlp": 0.01229016, + "balance_loss_clip": 1.11103272, + "balance_loss_mlp": 1.03026986, + "epoch": 0.5634150007515406, + "flos": 18517015847520.0, + "grad_norm": 2.0208231708605426, + "language_loss": 0.75688779, + "learning_rate": 1.687573444537108e-06, + "loss": 0.78350097, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.7721433639526367 + }, + { + "auxiliary_loss_clip": 0.01431247, + "auxiliary_loss_mlp": 0.01225619, + "balance_loss_clip": 1.10979962, + "balance_loss_mlp": 1.02801824, + "epoch": 0.5634751240042086, + "flos": 19246685076000.0, + "grad_norm": 1.9248604026243317, + "language_loss": 0.75902981, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78559846, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.946154832839966 + }, + { + "auxiliary_loss_clip": 0.01439215, + "auxiliary_loss_mlp": 0.01232187, + "balance_loss_clip": 1.1171782, + "balance_loss_mlp": 1.03201032, + "epoch": 0.5635352472568766, + "flos": 12022246664640.0, + "grad_norm": 2.195752595479656, + "language_loss": 0.7143451, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.74105906, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.7148630619049072 + }, + { + "auxiliary_loss_clip": 0.01451086, + "auxiliary_loss_mlp": 0.01233405, + "balance_loss_clip": 1.12884343, + "balance_loss_mlp": 1.02989125, + "epoch": 0.5635953705095446, + "flos": 21873805309440.0, + "grad_norm": 2.243002147163164, + "language_loss": 0.8288576, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.85570252, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 2.7925779819488525 + }, + { + "auxiliary_loss_clip": 0.01438024, + "auxiliary_loss_mlp": 0.01222974, + "balance_loss_clip": 1.11398911, + "balance_loss_mlp": 1.02422786, + "epoch": 0.5636554937622126, + "flos": 27128652626880.0, + "grad_norm": 1.8971912165601115, + "language_loss": 0.66488856, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.69149852, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.7764761447906494 + }, + { + "auxiliary_loss_clip": 0.01446631, + "auxiliary_loss_mlp": 0.01235935, + "balance_loss_clip": 1.1226927, + "balance_loss_mlp": 1.03318393, + "epoch": 0.5637156170148805, + "flos": 12927714736320.0, + "grad_norm": 2.3908959930441664, + "language_loss": 0.81027579, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83710146, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.743083953857422 + }, + { + "auxiliary_loss_clip": 0.01438909, + "auxiliary_loss_mlp": 0.01230125, + "balance_loss_clip": 1.11378062, + "balance_loss_mlp": 1.03004456, + "epoch": 0.5637757402675485, + "flos": 45554526377280.0, + "grad_norm": 1.5939338266965448, + "language_loss": 0.68877304, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.7154634, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 2.97015643119812 + }, + { + "auxiliary_loss_clip": 0.01444644, + "auxiliary_loss_mlp": 0.01227836, + "balance_loss_clip": 1.12054551, + "balance_loss_mlp": 1.03042555, + "epoch": 0.5638358635202164, + "flos": 20888118450720.0, + "grad_norm": 1.905598599491265, + "language_loss": 0.7486164, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.77534115, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 2.809739828109741 + }, + { + "auxiliary_loss_clip": 0.01437642, + "auxiliary_loss_mlp": 0.01234094, + "balance_loss_clip": 1.11331379, + "balance_loss_mlp": 1.03210568, + "epoch": 0.5638959867728844, + "flos": 18808534599840.0, + "grad_norm": 2.4658797922205666, + "language_loss": 0.82479668, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.85151404, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.815793037414551 + }, + { + "auxiliary_loss_clip": 0.01437887, + "auxiliary_loss_mlp": 0.012249, + "balance_loss_clip": 1.11394739, + "balance_loss_mlp": 1.02415156, + "epoch": 0.5639561100255523, + "flos": 27492804534240.0, + "grad_norm": 2.470438410458309, + "language_loss": 0.71694016, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.74356806, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 2.8158318996429443 + }, + { + "auxiliary_loss_clip": 0.01445049, + "auxiliary_loss_mlp": 0.01231302, + "balance_loss_clip": 1.12192273, + "balance_loss_mlp": 1.03026772, + "epoch": 0.5640162332782204, + "flos": 18078486089760.0, + "grad_norm": 3.5931816268333403, + "language_loss": 0.74331403, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.77007759, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.7770466804504395 + }, + { + "auxiliary_loss_clip": 0.01441427, + "auxiliary_loss_mlp": 0.01226937, + "balance_loss_clip": 1.11694634, + "balance_loss_mlp": 1.02685618, + "epoch": 0.5640763565308883, + "flos": 20886866821440.0, + "grad_norm": 2.0084820034065323, + "language_loss": 0.72011679, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74680042, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.803342580795288 + }, + { + "auxiliary_loss_clip": 0.0148757, + "auxiliary_loss_mlp": 0.01185059, + "balance_loss_clip": 1.17738247, + "balance_loss_mlp": 0.99127197, + "epoch": 0.5641364797835563, + "flos": 64454354786880.0, + "grad_norm": 0.7317684763438165, + "language_loss": 0.54357755, + "learning_rate": 1.682958136989022e-06, + "loss": 0.57030386, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.484156608581543 + }, + { + "auxiliary_loss_clip": 0.01435666, + "auxiliary_loss_mlp": 0.01232465, + "balance_loss_clip": 1.11328721, + "balance_loss_mlp": 1.03438687, + "epoch": 0.5641966030362242, + "flos": 18662926936320.0, + "grad_norm": 1.7380690231847955, + "language_loss": 0.70600659, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73268783, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 2.8071906566619873 + }, + { + "auxiliary_loss_clip": 0.01439344, + "auxiliary_loss_mlp": 0.01219194, + "balance_loss_clip": 1.1168766, + "balance_loss_mlp": 1.01644325, + "epoch": 0.5642567262888922, + "flos": 22494543769440.0, + "grad_norm": 3.8411860058314664, + "language_loss": 0.76066887, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78725433, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 4.133168935775757 + }, + { + "auxiliary_loss_clip": 0.01440388, + "auxiliary_loss_mlp": 0.0122698, + "balance_loss_clip": 1.11787701, + "balance_loss_mlp": 1.02461016, + "epoch": 0.5643168495415603, + "flos": 13005544049280.0, + "grad_norm": 2.030000428531705, + "language_loss": 0.82359332, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.85026705, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 2.808835744857788 + }, + { + "auxiliary_loss_clip": 0.01437274, + "auxiliary_loss_mlp": 0.01235692, + "balance_loss_clip": 1.11440527, + "balance_loss_mlp": 1.03465748, + "epoch": 0.5643769727942282, + "flos": 18590293781280.0, + "grad_norm": 1.902461595839627, + "language_loss": 0.69759846, + "learning_rate": 1.681420084607516e-06, + "loss": 0.7243281, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.795130729675293 + }, + { + "auxiliary_loss_clip": 0.01438854, + "auxiliary_loss_mlp": 0.0123229, + "balance_loss_clip": 1.11606014, + "balance_loss_mlp": 1.0356425, + "epoch": 0.5644370960468962, + "flos": 33809309543520.0, + "grad_norm": 1.575196046557912, + "language_loss": 0.74379641, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.77050781, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 2.9426333904266357 + }, + { + "auxiliary_loss_clip": 0.01438477, + "auxiliary_loss_mlp": 0.01218951, + "balance_loss_clip": 1.11597919, + "balance_loss_mlp": 1.02411497, + "epoch": 0.5644972192995641, + "flos": 21217034733120.0, + "grad_norm": 1.5731452496700962, + "language_loss": 0.82243168, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84900594, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 2.831822156906128 + }, + { + "auxiliary_loss_clip": 0.01446636, + "auxiliary_loss_mlp": 0.01239806, + "balance_loss_clip": 1.12264597, + "balance_loss_mlp": 1.03781796, + "epoch": 0.5645573425522321, + "flos": 18589686930720.0, + "grad_norm": 2.1335154784548442, + "language_loss": 0.63963097, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66649538, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.785752058029175 + }, + { + "auxiliary_loss_clip": 0.01438977, + "auxiliary_loss_mlp": 0.01231235, + "balance_loss_clip": 1.11694872, + "balance_loss_mlp": 1.03315699, + "epoch": 0.5646174658049, + "flos": 18115769835360.0, + "grad_norm": 1.786936793301765, + "language_loss": 0.92279053, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94949269, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.803663730621338 + }, + { + "auxiliary_loss_clip": 0.01438956, + "auxiliary_loss_mlp": 0.01237064, + "balance_loss_clip": 1.11628354, + "balance_loss_mlp": 1.03574324, + "epoch": 0.564677589057568, + "flos": 28332504447840.0, + "grad_norm": 1.9003065899131797, + "language_loss": 0.6012336, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62799382, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 4.365333318710327 + }, + { + "auxiliary_loss_clip": 0.01435849, + "auxiliary_loss_mlp": 0.0122508, + "balance_loss_clip": 1.11349106, + "balance_loss_mlp": 1.02480853, + "epoch": 0.564737712310236, + "flos": 22165930912320.0, + "grad_norm": 2.4194523988652556, + "language_loss": 0.81353384, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.84014308, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 2.823380708694458 + }, + { + "auxiliary_loss_clip": 0.01441431, + "auxiliary_loss_mlp": 0.01227931, + "balance_loss_clip": 1.12044382, + "balance_loss_mlp": 1.02994847, + "epoch": 0.564797835562904, + "flos": 20961092959200.0, + "grad_norm": 1.6972789440835185, + "language_loss": 0.87097704, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89767063, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.813601016998291 + }, + { + "auxiliary_loss_clip": 0.01442079, + "auxiliary_loss_mlp": 0.01227108, + "balance_loss_clip": 1.11989808, + "balance_loss_mlp": 1.02750444, + "epoch": 0.5648579588155719, + "flos": 17422208579520.0, + "grad_norm": 1.8741314476760946, + "language_loss": 0.84750843, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.87420034, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 4.3742711544036865 + }, + { + "auxiliary_loss_clip": 0.01474942, + "auxiliary_loss_mlp": 0.01195259, + "balance_loss_clip": 1.16710615, + "balance_loss_mlp": 1.00261688, + "epoch": 0.5649180820682399, + "flos": 69936811178400.0, + "grad_norm": 0.7971613168164637, + "language_loss": 0.58229399, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60899597, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 3.371934175491333 + }, + { + "auxiliary_loss_clip": 0.0143572, + "auxiliary_loss_mlp": 0.01235689, + "balance_loss_clip": 1.11365962, + "balance_loss_mlp": 1.03789663, + "epoch": 0.5649782053209078, + "flos": 24975411560640.0, + "grad_norm": 2.343084427192513, + "language_loss": 0.69725436, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72396845, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 2.808575391769409 + }, + { + "auxiliary_loss_clip": 0.01428786, + "auxiliary_loss_mlp": 0.01236395, + "balance_loss_clip": 1.10717082, + "balance_loss_mlp": 1.04060602, + "epoch": 0.5650383285735758, + "flos": 21728804496480.0, + "grad_norm": 2.3231881182361787, + "language_loss": 0.66585165, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69250345, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.7760040760040283 + }, + { + "auxiliary_loss_clip": 0.01470648, + "auxiliary_loss_mlp": 0.01187386, + "balance_loss_clip": 1.16269958, + "balance_loss_mlp": 0.99588776, + "epoch": 0.5650984518262439, + "flos": 65910848631840.0, + "grad_norm": 0.8446564117224945, + "language_loss": 0.58037949, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60695982, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 4.65338921546936 + }, + { + "auxiliary_loss_clip": 0.0143268, + "auxiliary_loss_mlp": 0.01230425, + "balance_loss_clip": 1.11136067, + "balance_loss_mlp": 1.03253806, + "epoch": 0.5651585750789118, + "flos": 21034712246400.0, + "grad_norm": 1.976838160717955, + "language_loss": 0.73181331, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75844431, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.8136684894561768 + }, + { + "auxiliary_loss_clip": 0.01437974, + "auxiliary_loss_mlp": 0.0123632, + "balance_loss_clip": 1.11566639, + "balance_loss_mlp": 1.03595304, + "epoch": 0.5652186983315798, + "flos": 18553920311520.0, + "grad_norm": 2.017548258842244, + "language_loss": 0.6092481, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63599098, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 2.83450984954834 + }, + { + "auxiliary_loss_clip": 0.01435, + "auxiliary_loss_mlp": 0.01237415, + "balance_loss_clip": 1.11325371, + "balance_loss_mlp": 1.04086268, + "epoch": 0.5652788215842477, + "flos": 18480642377760.0, + "grad_norm": 4.274909666962332, + "language_loss": 0.80892003, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83564413, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.815725564956665 + }, + { + "auxiliary_loss_clip": 0.01433347, + "auxiliary_loss_mlp": 0.01228347, + "balance_loss_clip": 1.11129141, + "balance_loss_mlp": 1.03055537, + "epoch": 0.5653389448369157, + "flos": 30046646833920.0, + "grad_norm": 1.5568603495949298, + "language_loss": 0.77513969, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80175674, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 2.9275834560394287 + }, + { + "auxiliary_loss_clip": 0.01430714, + "auxiliary_loss_mlp": 0.01234205, + "balance_loss_clip": 1.1105032, + "balance_loss_mlp": 1.03612661, + "epoch": 0.5653990680895836, + "flos": 16729443815040.0, + "grad_norm": 1.8310248698835976, + "language_loss": 0.68646848, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.7131176, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.7887356281280518 + }, + { + "auxiliary_loss_clip": 0.01432038, + "auxiliary_loss_mlp": 0.01223077, + "balance_loss_clip": 1.11026692, + "balance_loss_mlp": 1.02662015, + "epoch": 0.5654591913422516, + "flos": 14540018919840.0, + "grad_norm": 1.8304732409783546, + "language_loss": 0.66843069, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69498181, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 2.805830240249634 + }, + { + "auxiliary_loss_clip": 0.0144567, + "auxiliary_loss_mlp": 0.01231979, + "balance_loss_clip": 1.12415624, + "balance_loss_mlp": 1.03313792, + "epoch": 0.5655193145949196, + "flos": 26212109532480.0, + "grad_norm": 1.7632771565331502, + "language_loss": 0.74233562, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76911211, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 2.8336658477783203 + }, + { + "auxiliary_loss_clip": 0.01435739, + "auxiliary_loss_mlp": 0.01237404, + "balance_loss_clip": 1.11363733, + "balance_loss_mlp": 1.03999388, + "epoch": 0.5655794378475876, + "flos": 25048954991520.0, + "grad_norm": 1.936026537958535, + "language_loss": 0.80193287, + "learning_rate": 1.673732740698882e-06, + "loss": 0.8286643, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 2.8387482166290283 + }, + { + "auxiliary_loss_clip": 0.01445241, + "auxiliary_loss_mlp": 0.01237549, + "balance_loss_clip": 1.12262022, + "balance_loss_mlp": 1.04071081, + "epoch": 0.5656395611002555, + "flos": 31035709298880.0, + "grad_norm": 1.3459260789336212, + "language_loss": 0.71002364, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73685157, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 2.9002625942230225 + }, + { + "auxiliary_loss_clip": 0.01438017, + "auxiliary_loss_mlp": 0.01228713, + "balance_loss_clip": 1.11675978, + "balance_loss_mlp": 1.03130221, + "epoch": 0.5656996843529235, + "flos": 20231575443360.0, + "grad_norm": 1.9086899871018672, + "language_loss": 0.81739151, + "learning_rate": 1.672964276570308e-06, + "loss": 0.84405875, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 3.004889965057373 + }, + { + "auxiliary_loss_clip": 0.01430929, + "auxiliary_loss_mlp": 0.0122588, + "balance_loss_clip": 1.10985994, + "balance_loss_mlp": 1.02866054, + "epoch": 0.5657598076055914, + "flos": 20998376704800.0, + "grad_norm": 1.8199134180945133, + "language_loss": 0.78500283, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.81157094, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.804171323776245 + }, + { + "auxiliary_loss_clip": 0.01433231, + "auxiliary_loss_mlp": 0.01232379, + "balance_loss_clip": 1.11133814, + "balance_loss_mlp": 1.03287089, + "epoch": 0.5658199308582594, + "flos": 11547571006080.0, + "grad_norm": 4.04599602082459, + "language_loss": 0.83306682, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85972291, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.7475554943084717 + }, + { + "auxiliary_loss_clip": 0.01435815, + "auxiliary_loss_mlp": 0.01239173, + "balance_loss_clip": 1.11431324, + "balance_loss_mlp": 1.03785276, + "epoch": 0.5658800541109275, + "flos": 14173818891840.0, + "grad_norm": 2.4276824104284964, + "language_loss": 0.67903131, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.70578122, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 2.802995204925537 + }, + { + "auxiliary_loss_clip": 0.01435628, + "auxiliary_loss_mlp": 0.0123966, + "balance_loss_clip": 1.11393726, + "balance_loss_mlp": 1.04472876, + "epoch": 0.5659401773635954, + "flos": 27307371938400.0, + "grad_norm": 1.5558028600641176, + "language_loss": 0.58477783, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.61153078, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.818955898284912 + }, + { + "auxiliary_loss_clip": 0.01432181, + "auxiliary_loss_mlp": 0.01240033, + "balance_loss_clip": 1.11078238, + "balance_loss_mlp": 1.04481637, + "epoch": 0.5660003006162634, + "flos": 16730126521920.0, + "grad_norm": 1.7481776707986463, + "language_loss": 0.69508421, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.72180641, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 2.7324881553649902 + }, + { + "auxiliary_loss_clip": 0.01429848, + "auxiliary_loss_mlp": 0.01228796, + "balance_loss_clip": 1.10892129, + "balance_loss_mlp": 1.03224373, + "epoch": 0.5660604238689313, + "flos": 21655640347200.0, + "grad_norm": 1.7359743114108352, + "language_loss": 0.78344858, + "learning_rate": 1.670659182280247e-06, + "loss": 0.81003505, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 2.7630984783172607 + }, + { + "auxiliary_loss_clip": 0.01459225, + "auxiliary_loss_mlp": 0.01204475, + "balance_loss_clip": 1.15329337, + "balance_loss_mlp": 1.01412201, + "epoch": 0.5661205471215993, + "flos": 68830587534240.0, + "grad_norm": 0.7014311674717428, + "language_loss": 0.49095058, + "learning_rate": 1.670275043523822e-06, + "loss": 0.5175876, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.4131886959075928 + }, + { + "auxiliary_loss_clip": 0.01433728, + "auxiliary_loss_mlp": 0.01238775, + "balance_loss_clip": 1.11220849, + "balance_loss_mlp": 1.0425086, + "epoch": 0.5661806703742672, + "flos": 28624326625440.0, + "grad_norm": 1.832508657944555, + "language_loss": 0.62863088, + "learning_rate": 1.6698909172706e-06, + "loss": 0.65535593, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 2.8373382091522217 + }, + { + "auxiliary_loss_clip": 0.01432166, + "auxiliary_loss_mlp": 0.01231929, + "balance_loss_clip": 1.11230731, + "balance_loss_mlp": 1.0327065, + "epoch": 0.5662407936269352, + "flos": 21400115783040.0, + "grad_norm": 1.8048904118481979, + "language_loss": 0.68755639, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71419728, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.784003973007202 + }, + { + "auxiliary_loss_clip": 0.0143582, + "auxiliary_loss_mlp": 0.01240528, + "balance_loss_clip": 1.11483073, + "balance_loss_mlp": 1.04092443, + "epoch": 0.5663009168796032, + "flos": 25662031963200.0, + "grad_norm": 1.636122052019121, + "language_loss": 0.64399731, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.67076075, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.836364507675171 + }, + { + "auxiliary_loss_clip": 0.01462802, + "auxiliary_loss_mlp": 0.01227951, + "balance_loss_clip": 1.15635121, + "balance_loss_mlp": 1.0345459, + "epoch": 0.5663610401322712, + "flos": 67938015036960.0, + "grad_norm": 0.7480711612490857, + "language_loss": 0.5972923, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.62419987, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.322847366333008 + }, + { + "auxiliary_loss_clip": 0.01432267, + "auxiliary_loss_mlp": 0.01231928, + "balance_loss_clip": 1.11123872, + "balance_loss_mlp": 1.03680646, + "epoch": 0.5664211633849391, + "flos": 24611752719360.0, + "grad_norm": 1.676637527664061, + "language_loss": 0.73991024, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76655215, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 2.817599296569824 + }, + { + "auxiliary_loss_clip": 0.0143518, + "auxiliary_loss_mlp": 0.01238049, + "balance_loss_clip": 1.11465621, + "balance_loss_mlp": 1.03977966, + "epoch": 0.5664812866376071, + "flos": 11650016059200.0, + "grad_norm": 3.3030439974883574, + "language_loss": 0.73096323, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.7576955, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.7319018840789795 + }, + { + "auxiliary_loss_clip": 0.01433532, + "auxiliary_loss_mlp": 0.01225339, + "balance_loss_clip": 1.11395383, + "balance_loss_mlp": 1.03078985, + "epoch": 0.566541409890275, + "flos": 24646267709280.0, + "grad_norm": 1.7439433505697755, + "language_loss": 0.81600893, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.8425976, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 4.230859041213989 + }, + { + "auxiliary_loss_clip": 0.01434194, + "auxiliary_loss_mlp": 0.01234845, + "balance_loss_clip": 1.11409068, + "balance_loss_mlp": 1.03638506, + "epoch": 0.566601533142943, + "flos": 22272623919360.0, + "grad_norm": 4.439058786817524, + "language_loss": 0.80993837, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.83662874, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 2.9213387966156006 + }, + { + "auxiliary_loss_clip": 0.01437837, + "auxiliary_loss_mlp": 0.01240543, + "balance_loss_clip": 1.11642778, + "balance_loss_mlp": 1.03845966, + "epoch": 0.5666616563956111, + "flos": 29974279176000.0, + "grad_norm": 2.2185259007615707, + "language_loss": 0.78752637, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.81431019, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.872739791870117 + }, + { + "auxiliary_loss_clip": 0.01434807, + "auxiliary_loss_mlp": 0.01238486, + "balance_loss_clip": 1.11487341, + "balance_loss_mlp": 1.03983545, + "epoch": 0.566721779648279, + "flos": 17783174521440.0, + "grad_norm": 3.3792145673999765, + "language_loss": 0.59056836, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61730134, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.788344144821167 + }, + { + "auxiliary_loss_clip": 0.01440288, + "auxiliary_loss_mlp": 0.01235192, + "balance_loss_clip": 1.12020671, + "balance_loss_mlp": 1.03539705, + "epoch": 0.566781902900947, + "flos": 21035660450400.0, + "grad_norm": 1.8190851835395319, + "language_loss": 0.82048607, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.84724087, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 2.8460731506347656 + }, + { + "auxiliary_loss_clip": 0.01441112, + "auxiliary_loss_mlp": 0.01238029, + "balance_loss_clip": 1.12221324, + "balance_loss_mlp": 1.04033267, + "epoch": 0.5668420261536149, + "flos": 23150821279680.0, + "grad_norm": 1.923771883259438, + "language_loss": 0.86331517, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.89010656, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 2.902984619140625 + }, + { + "auxiliary_loss_clip": 0.01439908, + "auxiliary_loss_mlp": 0.01243559, + "balance_loss_clip": 1.1197412, + "balance_loss_mlp": 1.04252434, + "epoch": 0.5669021494062829, + "flos": 22603815891360.0, + "grad_norm": 2.7169184110141855, + "language_loss": 0.740116, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.76695061, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.8463029861450195 + }, + { + "auxiliary_loss_clip": 0.0143597, + "auxiliary_loss_mlp": 0.01231496, + "balance_loss_clip": 1.11516643, + "balance_loss_mlp": 1.03294086, + "epoch": 0.5669622726589508, + "flos": 17382990497760.0, + "grad_norm": 3.0144562459368562, + "language_loss": 0.7562775, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.78295219, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.783734083175659 + }, + { + "auxiliary_loss_clip": 0.01433616, + "auxiliary_loss_mlp": 0.01236048, + "balance_loss_clip": 1.11305118, + "balance_loss_mlp": 1.03968596, + "epoch": 0.5670223959116188, + "flos": 18764840995200.0, + "grad_norm": 1.96120147650894, + "language_loss": 0.72945136, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.75614798, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 4.279350996017456 + }, + { + "auxiliary_loss_clip": 0.01437235, + "auxiliary_loss_mlp": 0.01226658, + "balance_loss_clip": 1.11745834, + "balance_loss_mlp": 1.02924788, + "epoch": 0.5670825191642868, + "flos": 13555052696160.0, + "grad_norm": 1.7302111040278134, + "language_loss": 0.73451358, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.76115251, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 4.382115840911865 + }, + { + "auxiliary_loss_clip": 0.01431991, + "auxiliary_loss_mlp": 0.0123594, + "balance_loss_clip": 1.11324906, + "balance_loss_mlp": 1.03852999, + "epoch": 0.5671426424169548, + "flos": 22056203652480.0, + "grad_norm": 1.572639951309281, + "language_loss": 0.77873766, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80541694, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 2.8314902782440186 + }, + { + "auxiliary_loss_clip": 0.01434821, + "auxiliary_loss_mlp": 0.01233813, + "balance_loss_clip": 1.11538136, + "balance_loss_mlp": 1.03363681, + "epoch": 0.5672027656696227, + "flos": 21326079286080.0, + "grad_norm": 2.7000501487499253, + "language_loss": 0.63823611, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.66492242, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.832533597946167 + }, + { + "auxiliary_loss_clip": 0.01435279, + "auxiliary_loss_mlp": 0.01241031, + "balance_loss_clip": 1.11453843, + "balance_loss_mlp": 1.04285765, + "epoch": 0.5672628889222907, + "flos": 23516338600800.0, + "grad_norm": 1.808840461938646, + "language_loss": 0.66879523, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.69555831, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 2.843353033065796 + }, + { + "auxiliary_loss_clip": 0.01431883, + "auxiliary_loss_mlp": 0.01224599, + "balance_loss_clip": 1.11150777, + "balance_loss_mlp": 1.03004956, + "epoch": 0.5673230121749586, + "flos": 27124177104000.0, + "grad_norm": 1.5526479831891538, + "language_loss": 0.71770561, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.74427044, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 2.793299436569214 + }, + { + "auxiliary_loss_clip": 0.01431022, + "auxiliary_loss_mlp": 0.01231731, + "balance_loss_clip": 1.11154437, + "balance_loss_mlp": 1.03355753, + "epoch": 0.5673831354276266, + "flos": 31144829708160.0, + "grad_norm": 1.618471410686273, + "language_loss": 0.74091256, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.7675401, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 2.902376174926758 + }, + { + "auxiliary_loss_clip": 0.0143615, + "auxiliary_loss_mlp": 0.01232009, + "balance_loss_clip": 1.11631024, + "balance_loss_mlp": 1.03097415, + "epoch": 0.5674432586802945, + "flos": 27675695943360.0, + "grad_norm": 1.6746202996065023, + "language_loss": 0.6122396, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63892126, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 4.297806024551392 + }, + { + "auxiliary_loss_clip": 0.01433963, + "auxiliary_loss_mlp": 0.01234781, + "balance_loss_clip": 1.11357319, + "balance_loss_mlp": 1.03498626, + "epoch": 0.5675033819329626, + "flos": 26617603498560.0, + "grad_norm": 1.642054449371487, + "language_loss": 0.75004011, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77672756, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.7690088748931885 + }, + { + "auxiliary_loss_clip": 0.01437113, + "auxiliary_loss_mlp": 0.01242576, + "balance_loss_clip": 1.11781454, + "balance_loss_mlp": 1.04287672, + "epoch": 0.5675635051856306, + "flos": 19100356777440.0, + "grad_norm": 1.911400411089697, + "language_loss": 0.83696878, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.86376566, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.835127353668213 + }, + { + "auxiliary_loss_clip": 0.01432844, + "auxiliary_loss_mlp": 0.012337, + "balance_loss_clip": 1.11260366, + "balance_loss_mlp": 1.03581238, + "epoch": 0.5676236284382985, + "flos": 17568233452800.0, + "grad_norm": 2.4031859936026696, + "language_loss": 0.75878745, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.78545284, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.7539865970611572 + }, + { + "auxiliary_loss_clip": 0.01432484, + "auxiliary_loss_mlp": 0.0123432, + "balance_loss_clip": 1.11343431, + "balance_loss_mlp": 1.03605151, + "epoch": 0.5676837516909665, + "flos": 15955512059520.0, + "grad_norm": 1.8456407343003918, + "language_loss": 0.82921219, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85588026, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 2.8033788204193115 + }, + { + "auxiliary_loss_clip": 0.01439559, + "auxiliary_loss_mlp": 0.01232587, + "balance_loss_clip": 1.11981285, + "balance_loss_mlp": 1.03498602, + "epoch": 0.5677438749436344, + "flos": 18297864753120.0, + "grad_norm": 1.8541210042374117, + "language_loss": 0.74709964, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.77382112, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 2.7072510719299316 + }, + { + "auxiliary_loss_clip": 0.01433977, + "auxiliary_loss_mlp": 0.01224395, + "balance_loss_clip": 1.11526966, + "balance_loss_mlp": 1.02574468, + "epoch": 0.5678039981963025, + "flos": 17933257707840.0, + "grad_norm": 2.210540434889252, + "language_loss": 0.77638096, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.80296469, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 2.69677734375 + }, + { + "auxiliary_loss_clip": 0.01430299, + "auxiliary_loss_mlp": 0.01243105, + "balance_loss_clip": 1.11070549, + "balance_loss_mlp": 1.04502702, + "epoch": 0.5678641214489704, + "flos": 19318294170720.0, + "grad_norm": 1.876907732774739, + "language_loss": 0.81179267, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83852667, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.70576548576355 + }, + { + "auxiliary_loss_clip": 0.01430682, + "auxiliary_loss_mlp": 0.01232225, + "balance_loss_clip": 1.11119866, + "balance_loss_mlp": 1.03471911, + "epoch": 0.5679242447016384, + "flos": 27753866609760.0, + "grad_norm": 1.4362473726211864, + "language_loss": 0.71078336, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73741245, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.729550361633301 + }, + { + "auxiliary_loss_clip": 0.01429344, + "auxiliary_loss_mlp": 0.01241353, + "balance_loss_clip": 1.10904455, + "balance_loss_mlp": 1.04327512, + "epoch": 0.5679843679543063, + "flos": 23771370098880.0, + "grad_norm": 2.2021424072034796, + "language_loss": 0.73677325, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.76348019, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 2.6469438076019287 + }, + { + "auxiliary_loss_clip": 0.01431919, + "auxiliary_loss_mlp": 0.01241101, + "balance_loss_clip": 1.11243832, + "balance_loss_mlp": 1.04330873, + "epoch": 0.5680444912069743, + "flos": 25594177756320.0, + "grad_norm": 1.8425265508354864, + "language_loss": 0.75491834, + "learning_rate": 1.657989284462725e-06, + "loss": 0.78164852, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 2.7946081161499023 + }, + { + "auxiliary_loss_clip": 0.01438443, + "auxiliary_loss_mlp": 0.012384, + "balance_loss_clip": 1.11779904, + "balance_loss_mlp": 1.03831935, + "epoch": 0.5681046144596422, + "flos": 23698054236960.0, + "grad_norm": 2.246057338381794, + "language_loss": 0.7651099, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.79187828, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.7800817489624023 + }, + { + "auxiliary_loss_clip": 0.01429326, + "auxiliary_loss_mlp": 0.01232781, + "balance_loss_clip": 1.1097914, + "balance_loss_mlp": 1.03413081, + "epoch": 0.5681647377123102, + "flos": 28003246812000.0, + "grad_norm": 1.9297467424966257, + "language_loss": 0.74685967, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.77348071, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 2.8133037090301514 + }, + { + "auxiliary_loss_clip": 0.01425389, + "auxiliary_loss_mlp": 0.01240751, + "balance_loss_clip": 1.10689867, + "balance_loss_mlp": 1.04524779, + "epoch": 0.5682248609649782, + "flos": 22749992477280.0, + "grad_norm": 1.6871779208183244, + "language_loss": 0.66552663, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69218802, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.8265373706817627 + }, + { + "auxiliary_loss_clip": 0.01425763, + "auxiliary_loss_mlp": 0.01235575, + "balance_loss_clip": 1.10580063, + "balance_loss_mlp": 1.0343498, + "epoch": 0.5682849842176462, + "flos": 21290767804800.0, + "grad_norm": 1.9819338175138346, + "language_loss": 0.7227363, + "learning_rate": 1.656454488573026e-06, + "loss": 0.74934965, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 2.7579095363616943 + }, + { + "auxiliary_loss_clip": 0.0142809, + "auxiliary_loss_mlp": 0.01232385, + "balance_loss_clip": 1.10876679, + "balance_loss_mlp": 1.03287661, + "epoch": 0.5683451074703142, + "flos": 21143794727520.0, + "grad_norm": 1.5338154140941853, + "language_loss": 0.70295668, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72956139, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 2.9256882667541504 + }, + { + "auxiliary_loss_clip": 0.01430907, + "auxiliary_loss_mlp": 0.01229132, + "balance_loss_clip": 1.10881543, + "balance_loss_mlp": 1.03191161, + "epoch": 0.5684052307229821, + "flos": 22346584560000.0, + "grad_norm": 1.8236036005053011, + "language_loss": 0.69972736, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.72632778, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 2.8118510246276855 + }, + { + "auxiliary_loss_clip": 0.0142764, + "auxiliary_loss_mlp": 0.0122969, + "balance_loss_clip": 1.10820019, + "balance_loss_mlp": 1.03247035, + "epoch": 0.5684653539756501, + "flos": 21800792872800.0, + "grad_norm": 3.575576751449308, + "language_loss": 0.60422897, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.63080227, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.7687368392944336 + }, + { + "auxiliary_loss_clip": 0.01425722, + "auxiliary_loss_mlp": 0.01233379, + "balance_loss_clip": 1.10512316, + "balance_loss_mlp": 1.03463328, + "epoch": 0.568525477228318, + "flos": 23001686297280.0, + "grad_norm": 2.469009620186947, + "language_loss": 0.73546052, + "learning_rate": 1.6549199011198e-06, + "loss": 0.76205158, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 2.841486692428589 + }, + { + "auxiliary_loss_clip": 0.01424887, + "auxiliary_loss_mlp": 0.01226961, + "balance_loss_clip": 1.10301137, + "balance_loss_mlp": 1.02783334, + "epoch": 0.568585600480986, + "flos": 21394350702720.0, + "grad_norm": 1.9039474421196623, + "language_loss": 0.77037179, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79689026, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.7482542991638184 + }, + { + "auxiliary_loss_clip": 0.01424814, + "auxiliary_loss_mlp": 0.01232322, + "balance_loss_clip": 1.1043129, + "balance_loss_mlp": 1.03395724, + "epoch": 0.568645723733654, + "flos": 30009932010720.0, + "grad_norm": 2.0581765014318454, + "language_loss": 0.66061652, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68718791, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 2.8606882095336914 + }, + { + "auxiliary_loss_clip": 0.01427559, + "auxiliary_loss_mlp": 0.01232985, + "balance_loss_clip": 1.1065197, + "balance_loss_mlp": 1.03290391, + "epoch": 0.568705846986322, + "flos": 20414770277760.0, + "grad_norm": 2.400641734392848, + "language_loss": 0.68341547, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.7100209, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.831815242767334 + }, + { + "auxiliary_loss_clip": 0.014242, + "auxiliary_loss_mlp": 0.01237792, + "balance_loss_clip": 1.10416877, + "balance_loss_mlp": 1.03876066, + "epoch": 0.5687659702389899, + "flos": 17458657905600.0, + "grad_norm": 3.097967865356125, + "language_loss": 0.77082396, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79744387, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.714972972869873 + }, + { + "auxiliary_loss_clip": 0.01424498, + "auxiliary_loss_mlp": 0.01231002, + "balance_loss_clip": 1.10407758, + "balance_loss_mlp": 1.03139818, + "epoch": 0.5688260934916579, + "flos": 25408176238080.0, + "grad_norm": 1.8862481221263696, + "language_loss": 0.71760911, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74416411, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 4.244911432266235 + }, + { + "auxiliary_loss_clip": 0.01427148, + "auxiliary_loss_mlp": 0.01231079, + "balance_loss_clip": 1.10547447, + "balance_loss_mlp": 1.03185618, + "epoch": 0.5688862167443258, + "flos": 21609708981120.0, + "grad_norm": 2.0842196584982147, + "language_loss": 0.73153704, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75811929, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.7531063556671143 + }, + { + "auxiliary_loss_clip": 0.01424718, + "auxiliary_loss_mlp": 0.01227689, + "balance_loss_clip": 1.10506713, + "balance_loss_mlp": 1.03056407, + "epoch": 0.5689463399969938, + "flos": 22421189979360.0, + "grad_norm": 3.2429981275057305, + "language_loss": 0.72681874, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75334287, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 2.726223945617676 + }, + { + "auxiliary_loss_clip": 0.01427541, + "auxiliary_loss_mlp": 0.01226018, + "balance_loss_clip": 1.10710692, + "balance_loss_mlp": 1.02774894, + "epoch": 0.5690064632496618, + "flos": 18298812957120.0, + "grad_norm": 2.1351917566458396, + "language_loss": 0.73899913, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.7655347, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 2.699028491973877 + }, + { + "auxiliary_loss_clip": 0.01425157, + "auxiliary_loss_mlp": 0.01227452, + "balance_loss_clip": 1.10495424, + "balance_loss_mlp": 1.02851558, + "epoch": 0.5690665865023298, + "flos": 21581376281280.0, + "grad_norm": 1.652437457870818, + "language_loss": 0.84608185, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.87260795, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.806936025619507 + }, + { + "auxiliary_loss_clip": 0.01427958, + "auxiliary_loss_mlp": 0.01231202, + "balance_loss_clip": 1.10881138, + "balance_loss_mlp": 1.03379178, + "epoch": 0.5691267097549978, + "flos": 24423513439680.0, + "grad_norm": 1.6810065782601402, + "language_loss": 0.72071302, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74730468, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 2.8315229415893555 + }, + { + "auxiliary_loss_clip": 0.01481213, + "auxiliary_loss_mlp": 0.01196693, + "balance_loss_clip": 1.18160021, + "balance_loss_mlp": 1.00366974, + "epoch": 0.5691868330076657, + "flos": 61665279488640.0, + "grad_norm": 0.7099682243394547, + "language_loss": 0.55304372, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57982278, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 3.384267807006836 + }, + { + "auxiliary_loss_clip": 0.01424873, + "auxiliary_loss_mlp": 0.0123539, + "balance_loss_clip": 1.10441089, + "balance_loss_mlp": 1.0383606, + "epoch": 0.5692469562603337, + "flos": 21327368843520.0, + "grad_norm": 2.144066061048887, + "language_loss": 0.6361028, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.66270536, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.788574695587158 + }, + { + "auxiliary_loss_clip": 0.01424752, + "auxiliary_loss_mlp": 0.01230664, + "balance_loss_clip": 1.10597801, + "balance_loss_mlp": 1.0356375, + "epoch": 0.5693070795130016, + "flos": 23370162014880.0, + "grad_norm": 2.04558337422979, + "language_loss": 0.79004776, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81660199, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 4.308763027191162 + }, + { + "auxiliary_loss_clip": 0.01425737, + "auxiliary_loss_mlp": 0.0123004, + "balance_loss_clip": 1.10512185, + "balance_loss_mlp": 1.02995944, + "epoch": 0.5693672027656697, + "flos": 18699224549760.0, + "grad_norm": 2.001084024815741, + "language_loss": 0.69421029, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.72076797, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 2.781646251678467 + }, + { + "auxiliary_loss_clip": 0.01425329, + "auxiliary_loss_mlp": 0.01226972, + "balance_loss_clip": 1.10627425, + "balance_loss_mlp": 1.02641451, + "epoch": 0.5694273260183376, + "flos": 20451295460160.0, + "grad_norm": 1.8889790069504024, + "language_loss": 0.74589068, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.77241373, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 4.206036567687988 + }, + { + "auxiliary_loss_clip": 0.01432601, + "auxiliary_loss_mlp": 0.01237969, + "balance_loss_clip": 1.11103845, + "balance_loss_mlp": 1.04027224, + "epoch": 0.5694874492710056, + "flos": 17605137916800.0, + "grad_norm": 2.1045608950741803, + "language_loss": 0.57452053, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.60122621, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 2.7767043113708496 + }, + { + "auxiliary_loss_clip": 0.01427612, + "auxiliary_loss_mlp": 0.01230973, + "balance_loss_clip": 1.10716546, + "balance_loss_mlp": 1.03327632, + "epoch": 0.5695475725236735, + "flos": 13372540568640.0, + "grad_norm": 2.135943845030175, + "language_loss": 0.73835444, + "learning_rate": 1.648400251450638e-06, + "loss": 0.76494026, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.7573931217193604 + }, + { + "auxiliary_loss_clip": 0.01489126, + "auxiliary_loss_mlp": 0.01194153, + "balance_loss_clip": 1.18900514, + "balance_loss_mlp": 1.00074768, + "epoch": 0.5696076957763415, + "flos": 68181326733600.0, + "grad_norm": 0.6726534025020682, + "language_loss": 0.57478702, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.60161984, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.4442641735076904 + }, + { + "auxiliary_loss_clip": 0.01431267, + "auxiliary_loss_mlp": 0.0122466, + "balance_loss_clip": 1.11205888, + "balance_loss_mlp": 1.02715385, + "epoch": 0.5696678190290094, + "flos": 33841093705920.0, + "grad_norm": 1.8229874037029852, + "language_loss": 0.53627104, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.56283033, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 2.8843374252319336 + }, + { + "auxiliary_loss_clip": 0.01429776, + "auxiliary_loss_mlp": 0.01234172, + "balance_loss_clip": 1.10949254, + "balance_loss_mlp": 1.03666615, + "epoch": 0.5697279422816774, + "flos": 26358855040800.0, + "grad_norm": 1.771545631437704, + "language_loss": 0.79602671, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82266617, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 4.311923980712891 + }, + { + "auxiliary_loss_clip": 0.01435068, + "auxiliary_loss_mlp": 0.01245791, + "balance_loss_clip": 1.11563301, + "balance_loss_mlp": 1.04981041, + "epoch": 0.5697880655343454, + "flos": 22932770101920.0, + "grad_norm": 2.0464632389967994, + "language_loss": 0.66578531, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.69259393, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 2.8969433307647705 + }, + { + "auxiliary_loss_clip": 0.01429809, + "auxiliary_loss_mlp": 0.01234145, + "balance_loss_clip": 1.11120319, + "balance_loss_mlp": 1.03749704, + "epoch": 0.5698481887870134, + "flos": 26763780084480.0, + "grad_norm": 1.7894607009670118, + "language_loss": 0.70750612, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73414564, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.871880054473877 + }, + { + "auxiliary_loss_clip": 0.01429169, + "auxiliary_loss_mlp": 0.01228556, + "balance_loss_clip": 1.11060274, + "balance_loss_mlp": 1.03095436, + "epoch": 0.5699083120396814, + "flos": 15744098309760.0, + "grad_norm": 1.9720348985369422, + "language_loss": 0.69663966, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.72321689, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 2.7625062465667725 + }, + { + "auxiliary_loss_clip": 0.01428876, + "auxiliary_loss_mlp": 0.01224642, + "balance_loss_clip": 1.11112809, + "balance_loss_mlp": 1.02952003, + "epoch": 0.5699684352923493, + "flos": 19539341673120.0, + "grad_norm": 1.6782293976431044, + "language_loss": 0.71534485, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.74188006, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.810128688812256 + }, + { + "auxiliary_loss_clip": 0.01431031, + "auxiliary_loss_mlp": 0.01247218, + "balance_loss_clip": 1.11228871, + "balance_loss_mlp": 1.05247808, + "epoch": 0.5700285585450173, + "flos": 16255223294400.0, + "grad_norm": 2.6061074848997468, + "language_loss": 0.72022593, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74700838, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 2.733767509460449 + }, + { + "auxiliary_loss_clip": 0.01434688, + "auxiliary_loss_mlp": 0.01237549, + "balance_loss_clip": 1.1158241, + "balance_loss_mlp": 1.03823078, + "epoch": 0.5700886817976852, + "flos": 19867044254400.0, + "grad_norm": 1.6968145639308025, + "language_loss": 0.78782475, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.81454718, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 2.7958743572235107 + }, + { + "auxiliary_loss_clip": 0.01431433, + "auxiliary_loss_mlp": 0.01229519, + "balance_loss_clip": 1.11322689, + "balance_loss_mlp": 1.03296661, + "epoch": 0.5701488050503533, + "flos": 23843965325760.0, + "grad_norm": 2.0914920015142733, + "language_loss": 0.7781601, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80476958, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 2.846484661102295 + }, + { + "auxiliary_loss_clip": 0.01429888, + "auxiliary_loss_mlp": 0.01241119, + "balance_loss_clip": 1.11208403, + "balance_loss_mlp": 1.04342198, + "epoch": 0.5702089283030212, + "flos": 23661946264320.0, + "grad_norm": 2.0090009303888876, + "language_loss": 0.81658638, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.84329647, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.86016583442688 + }, + { + "auxiliary_loss_clip": 0.01429068, + "auxiliary_loss_mlp": 0.01236411, + "balance_loss_clip": 1.1115247, + "balance_loss_mlp": 1.04033542, + "epoch": 0.5702690515556892, + "flos": 27893633336640.0, + "grad_norm": 2.6631578523263326, + "language_loss": 0.6111756, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.63783038, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.7917306423187256 + }, + { + "auxiliary_loss_clip": 0.01424048, + "auxiliary_loss_mlp": 0.01234132, + "balance_loss_clip": 1.10600352, + "balance_loss_mlp": 1.03510058, + "epoch": 0.5703291748083571, + "flos": 24026211956160.0, + "grad_norm": 1.884095877569884, + "language_loss": 0.65173858, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67832035, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.8837647438049316 + }, + { + "auxiliary_loss_clip": 0.01503476, + "auxiliary_loss_mlp": 0.01222732, + "balance_loss_clip": 1.20844769, + "balance_loss_mlp": 1.0304718, + "epoch": 0.5703892980610251, + "flos": 57030449996160.0, + "grad_norm": 0.6634398328853265, + "language_loss": 0.47972333, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50698543, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 3.386669158935547 + }, + { + "auxiliary_loss_clip": 0.01425139, + "auxiliary_loss_mlp": 0.01232772, + "balance_loss_clip": 1.1081003, + "balance_loss_mlp": 1.03631544, + "epoch": 0.570449421313693, + "flos": 24353231830560.0, + "grad_norm": 1.7474753165456804, + "language_loss": 0.86204565, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88862479, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.8412415981292725 + }, + { + "auxiliary_loss_clip": 0.01425465, + "auxiliary_loss_mlp": 0.01232076, + "balance_loss_clip": 1.10731125, + "balance_loss_mlp": 1.03313911, + "epoch": 0.570509544566361, + "flos": 24830979670080.0, + "grad_norm": 1.6173019892385452, + "language_loss": 0.7918849, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81846035, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 2.8624298572540283 + }, + { + "auxiliary_loss_clip": 0.01426472, + "auxiliary_loss_mlp": 0.0123082, + "balance_loss_clip": 1.10874474, + "balance_loss_mlp": 1.0339818, + "epoch": 0.570569667819029, + "flos": 21399433076160.0, + "grad_norm": 1.667672686335408, + "language_loss": 0.70030165, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72687459, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 2.802128553390503 + }, + { + "auxiliary_loss_clip": 0.01427764, + "auxiliary_loss_mlp": 0.01226813, + "balance_loss_clip": 1.11062157, + "balance_loss_mlp": 1.02978408, + "epoch": 0.570629791071697, + "flos": 23218713414720.0, + "grad_norm": 1.9215593682932541, + "language_loss": 0.76322222, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78976804, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.838308095932007 + }, + { + "auxiliary_loss_clip": 0.01518125, + "auxiliary_loss_mlp": 0.01211052, + "balance_loss_clip": 1.22298729, + "balance_loss_mlp": 1.01688385, + "epoch": 0.570689914324365, + "flos": 65291020447680.0, + "grad_norm": 0.78467236338216, + "language_loss": 0.57341397, + "learning_rate": 1.641118147266011e-06, + "loss": 0.60070574, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 3.329342842102051 + }, + { + "auxiliary_loss_clip": 0.01427639, + "auxiliary_loss_mlp": 0.01234948, + "balance_loss_clip": 1.11110878, + "balance_loss_mlp": 1.0360111, + "epoch": 0.5707500375770329, + "flos": 21144022296480.0, + "grad_norm": 1.7448214017653294, + "language_loss": 0.71400082, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.74062669, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.75816011428833 + }, + { + "auxiliary_loss_clip": 0.01427684, + "auxiliary_loss_mlp": 0.01232509, + "balance_loss_clip": 1.10987866, + "balance_loss_mlp": 1.03586161, + "epoch": 0.5708101608297009, + "flos": 20814802588800.0, + "grad_norm": 3.2831675638502493, + "language_loss": 0.78030604, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80690801, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 2.797002077102661 + }, + { + "auxiliary_loss_clip": 0.01426699, + "auxiliary_loss_mlp": 0.01233383, + "balance_loss_clip": 1.10714674, + "balance_loss_mlp": 1.03673553, + "epoch": 0.5708702840823688, + "flos": 25814732192640.0, + "grad_norm": 2.799918459115127, + "language_loss": 0.80245751, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82905841, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.809720039367676 + }, + { + "auxiliary_loss_clip": 0.01427238, + "auxiliary_loss_mlp": 0.01240394, + "balance_loss_clip": 1.1077733, + "balance_loss_mlp": 1.04231524, + "epoch": 0.5709304073350369, + "flos": 23653753781760.0, + "grad_norm": 2.0986727468031607, + "language_loss": 0.65925705, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68593335, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 2.872525453567505 + }, + { + "auxiliary_loss_clip": 0.01423173, + "auxiliary_loss_mlp": 0.01235631, + "balance_loss_clip": 1.10508299, + "balance_loss_mlp": 1.03717184, + "epoch": 0.5709905305877048, + "flos": 16109425990080.0, + "grad_norm": 2.0868187495806305, + "language_loss": 0.69923347, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.7258215, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.786227226257324 + }, + { + "auxiliary_loss_clip": 0.01424493, + "auxiliary_loss_mlp": 0.01231776, + "balance_loss_clip": 1.10666537, + "balance_loss_mlp": 1.03283954, + "epoch": 0.5710506538403728, + "flos": 24752960716320.0, + "grad_norm": 1.7486800914421614, + "language_loss": 0.81160861, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83817124, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.8602683544158936 + }, + { + "auxiliary_loss_clip": 0.01426578, + "auxiliary_loss_mlp": 0.01234697, + "balance_loss_clip": 1.10891652, + "balance_loss_mlp": 1.0367142, + "epoch": 0.5711107770930407, + "flos": 21984480773280.0, + "grad_norm": 1.9641579848973376, + "language_loss": 0.66515124, + "learning_rate": 1.638436499891469e-06, + "loss": 0.691764, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.83268404006958 + }, + { + "auxiliary_loss_clip": 0.01427713, + "auxiliary_loss_mlp": 0.01230241, + "balance_loss_clip": 1.11006582, + "balance_loss_mlp": 1.0315907, + "epoch": 0.5711709003457087, + "flos": 19576094424480.0, + "grad_norm": 2.2963264454631407, + "language_loss": 0.71693587, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.74351537, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 4.189098119735718 + }, + { + "auxiliary_loss_clip": 0.01430001, + "auxiliary_loss_mlp": 0.01240279, + "balance_loss_clip": 1.11190891, + "balance_loss_mlp": 1.04325032, + "epoch": 0.5712310235983766, + "flos": 24244945840800.0, + "grad_norm": 2.8402408515319433, + "language_loss": 0.76311707, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78981984, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 2.8432114124298096 + }, + { + "auxiliary_loss_clip": 0.01429361, + "auxiliary_loss_mlp": 0.01229956, + "balance_loss_clip": 1.11165023, + "balance_loss_mlp": 1.03159142, + "epoch": 0.5712911468510447, + "flos": 20998262920320.0, + "grad_norm": 1.6869370510554682, + "language_loss": 0.75042915, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77702224, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.786033868789673 + }, + { + "auxiliary_loss_clip": 0.01429028, + "auxiliary_loss_mlp": 0.01227008, + "balance_loss_clip": 1.11004972, + "balance_loss_mlp": 1.03093219, + "epoch": 0.5713512701037126, + "flos": 18919020422880.0, + "grad_norm": 1.585322257294268, + "language_loss": 0.82161993, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84818029, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.8002829551696777 + }, + { + "auxiliary_loss_clip": 0.01433123, + "auxiliary_loss_mlp": 0.01241306, + "balance_loss_clip": 1.11316872, + "balance_loss_mlp": 1.04580235, + "epoch": 0.5714113933563806, + "flos": 17414964300960.0, + "grad_norm": 2.4837904912003497, + "language_loss": 0.85932881, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88607311, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 2.784557342529297 + }, + { + "auxiliary_loss_clip": 0.01433199, + "auxiliary_loss_mlp": 0.01230473, + "balance_loss_clip": 1.11447465, + "balance_loss_mlp": 1.03296733, + "epoch": 0.5714715166090486, + "flos": 20195429542560.0, + "grad_norm": 2.8937131942158216, + "language_loss": 0.75198555, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77862221, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 2.755622148513794 + }, + { + "auxiliary_loss_clip": 0.01444262, + "auxiliary_loss_mlp": 0.0122635, + "balance_loss_clip": 1.12455463, + "balance_loss_mlp": 1.02569699, + "epoch": 0.5715316398617165, + "flos": 18553654814400.0, + "grad_norm": 1.517671675377446, + "language_loss": 0.82117844, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84788454, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 2.7595345973968506 + }, + { + "auxiliary_loss_clip": 0.01437789, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_clip": 1.11766613, + "balance_loss_mlp": 1.03744972, + "epoch": 0.5715917631143845, + "flos": 18480149311680.0, + "grad_norm": 2.682504244434124, + "language_loss": 0.7768054, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.80353755, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 4.326131582260132 + }, + { + "auxiliary_loss_clip": 0.01435108, + "auxiliary_loss_mlp": 0.01233207, + "balance_loss_clip": 1.11480379, + "balance_loss_mlp": 1.03570068, + "epoch": 0.5716518863670524, + "flos": 24022077786720.0, + "grad_norm": 1.5377976530192976, + "language_loss": 0.68459809, + "learning_rate": 1.63498965540751e-06, + "loss": 0.71128118, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 4.357468128204346 + }, + { + "auxiliary_loss_clip": 0.01434408, + "auxiliary_loss_mlp": 0.01225817, + "balance_loss_clip": 1.11408126, + "balance_loss_mlp": 1.0258317, + "epoch": 0.5717120096197205, + "flos": 17821292686560.0, + "grad_norm": 2.210652583166297, + "language_loss": 0.79419446, + "learning_rate": 1.634606741699593e-06, + "loss": 0.82079667, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 2.7792680263519287 + }, + { + "auxiliary_loss_clip": 0.0143463, + "auxiliary_loss_mlp": 0.01226567, + "balance_loss_clip": 1.11466575, + "balance_loss_mlp": 1.02953792, + "epoch": 0.5717721328723884, + "flos": 21867888516480.0, + "grad_norm": 2.238961186471177, + "language_loss": 0.71864712, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74525905, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.821152687072754 + }, + { + "auxiliary_loss_clip": 0.01436221, + "auxiliary_loss_mlp": 0.01224396, + "balance_loss_clip": 1.11498296, + "balance_loss_mlp": 1.02698517, + "epoch": 0.5718322561250564, + "flos": 28440069802560.0, + "grad_norm": 1.9185048061362722, + "language_loss": 0.6932916, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71989775, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 2.8353042602539062 + }, + { + "auxiliary_loss_clip": 0.01429944, + "auxiliary_loss_mlp": 0.01230646, + "balance_loss_clip": 1.11105061, + "balance_loss_mlp": 1.03285372, + "epoch": 0.5718923793777243, + "flos": 13553459713440.0, + "grad_norm": 3.5755448235046057, + "language_loss": 0.61432076, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.64092672, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.84743595123291 + }, + { + "auxiliary_loss_clip": 0.01429785, + "auxiliary_loss_mlp": 0.01225976, + "balance_loss_clip": 1.11018085, + "balance_loss_mlp": 1.02827954, + "epoch": 0.5719525026303923, + "flos": 17823985585920.0, + "grad_norm": 10.286538488737186, + "language_loss": 0.7627148, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78927243, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 4.2700183391571045 + }, + { + "auxiliary_loss_clip": 0.0148391, + "auxiliary_loss_mlp": 0.01182281, + "balance_loss_clip": 1.18573463, + "balance_loss_mlp": 0.99002075, + "epoch": 0.5720126258830602, + "flos": 61303631204160.0, + "grad_norm": 0.8925514011829692, + "language_loss": 0.66887653, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.6955384, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.306558847427368 + }, + { + "auxiliary_loss_clip": 0.01432734, + "auxiliary_loss_mlp": 0.01233363, + "balance_loss_clip": 1.11215115, + "balance_loss_mlp": 1.03337717, + "epoch": 0.5720727491357283, + "flos": 23990331552480.0, + "grad_norm": 2.2827467254938183, + "language_loss": 0.8134023, + "learning_rate": 1.63230955093099e-06, + "loss": 0.84006333, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.820340394973755 + }, + { + "auxiliary_loss_clip": 0.01430272, + "auxiliary_loss_mlp": 0.01227447, + "balance_loss_clip": 1.10972905, + "balance_loss_mlp": 1.03137207, + "epoch": 0.5721328723883962, + "flos": 23407749185760.0, + "grad_norm": 2.0339248574321998, + "language_loss": 0.85871673, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88529396, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 2.775547981262207 + }, + { + "auxiliary_loss_clip": 0.014331, + "auxiliary_loss_mlp": 0.01228188, + "balance_loss_clip": 1.11294246, + "balance_loss_mlp": 1.03001404, + "epoch": 0.5721929956410642, + "flos": 18806789904480.0, + "grad_norm": 1.7116516161840472, + "language_loss": 0.87408555, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.90069836, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 2.8032209873199463 + }, + { + "auxiliary_loss_clip": 0.0143929, + "auxiliary_loss_mlp": 0.01232906, + "balance_loss_clip": 1.11941147, + "balance_loss_mlp": 1.03320622, + "epoch": 0.5722531188937322, + "flos": 27199010092320.0, + "grad_norm": 1.8969409484519326, + "language_loss": 0.85172558, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87844753, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.7971103191375732 + }, + { + "auxiliary_loss_clip": 0.01436387, + "auxiliary_loss_mlp": 0.01237341, + "balance_loss_clip": 1.11729562, + "balance_loss_mlp": 1.04212379, + "epoch": 0.5723132421464001, + "flos": 15197510131200.0, + "grad_norm": 1.619116548558092, + "language_loss": 0.7906673, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.81740463, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 2.770719528198242 + }, + { + "auxiliary_loss_clip": 0.01434379, + "auxiliary_loss_mlp": 0.01231633, + "balance_loss_clip": 1.11424506, + "balance_loss_mlp": 1.03689313, + "epoch": 0.5723733653990681, + "flos": 27601887015360.0, + "grad_norm": 1.4695357671458664, + "language_loss": 0.82841414, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85507429, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 2.836308717727661 + }, + { + "auxiliary_loss_clip": 0.0143202, + "auxiliary_loss_mlp": 0.01230273, + "balance_loss_clip": 1.11307144, + "balance_loss_mlp": 1.03295815, + "epoch": 0.572433488651736, + "flos": 18224814388320.0, + "grad_norm": 2.0646308091675873, + "language_loss": 0.72043002, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74705297, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 2.7362842559814453 + }, + { + "auxiliary_loss_clip": 0.01436688, + "auxiliary_loss_mlp": 0.01232908, + "balance_loss_clip": 1.118222, + "balance_loss_mlp": 1.03769112, + "epoch": 0.5724936119044041, + "flos": 31251826140480.0, + "grad_norm": 2.6844056667640577, + "language_loss": 0.78425336, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.81094933, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.863689422607422 + }, + { + "auxiliary_loss_clip": 0.01434708, + "auxiliary_loss_mlp": 0.01222553, + "balance_loss_clip": 1.11694074, + "balance_loss_mlp": 1.02666819, + "epoch": 0.572553735157072, + "flos": 19203863819040.0, + "grad_norm": 1.5197361679419337, + "language_loss": 0.71722853, + "learning_rate": 1.629247411248102e-06, + "loss": 0.74380112, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 2.747915267944336 + }, + { + "auxiliary_loss_clip": 0.01435956, + "auxiliary_loss_mlp": 0.01220986, + "balance_loss_clip": 1.11701059, + "balance_loss_mlp": 1.02481508, + "epoch": 0.57261385840974, + "flos": 21217034733120.0, + "grad_norm": 1.7551229364285557, + "language_loss": 0.70123577, + "learning_rate": 1.628864706900738e-06, + "loss": 0.7278052, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.825676441192627 + }, + { + "auxiliary_loss_clip": 0.01437483, + "auxiliary_loss_mlp": 0.01232907, + "balance_loss_clip": 1.11825967, + "balance_loss_mlp": 1.03711748, + "epoch": 0.5726739816624079, + "flos": 33987346148160.0, + "grad_norm": 1.404043761054311, + "language_loss": 0.65365863, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.68036246, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 2.8412036895751953 + }, + { + "auxiliary_loss_clip": 0.01435184, + "auxiliary_loss_mlp": 0.01233248, + "balance_loss_clip": 1.11552954, + "balance_loss_mlp": 1.03841269, + "epoch": 0.5727341049150759, + "flos": 24278057488800.0, + "grad_norm": 2.0792598805021925, + "language_loss": 0.7255497, + "learning_rate": 1.628099340440984e-06, + "loss": 0.75223404, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 2.9126431941986084 + }, + { + "auxiliary_loss_clip": 0.01439996, + "auxiliary_loss_mlp": 0.01232288, + "balance_loss_clip": 1.12009096, + "balance_loss_mlp": 1.03783345, + "epoch": 0.5727942281677438, + "flos": 28402975697760.0, + "grad_norm": 3.7964810661977495, + "language_loss": 0.7990064, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82572925, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 2.8652970790863037 + }, + { + "auxiliary_loss_clip": 0.01439608, + "auxiliary_loss_mlp": 0.01238218, + "balance_loss_clip": 1.12109816, + "balance_loss_mlp": 1.04052162, + "epoch": 0.5728543514204119, + "flos": 19538810678880.0, + "grad_norm": 1.7732959033761904, + "language_loss": 0.71968913, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74646747, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 2.7637457847595215 + }, + { + "auxiliary_loss_clip": 0.01435539, + "auxiliary_loss_mlp": 0.01234693, + "balance_loss_clip": 1.11571717, + "balance_loss_mlp": 1.03718722, + "epoch": 0.5729144746730798, + "flos": 21508894838880.0, + "grad_norm": 1.9467010110880878, + "language_loss": 0.85975868, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88646102, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 2.810577630996704 + }, + { + "auxiliary_loss_clip": 0.01466353, + "auxiliary_loss_mlp": 0.01202301, + "balance_loss_clip": 1.17204595, + "balance_loss_mlp": 1.01194763, + "epoch": 0.5729745979257478, + "flos": 58687661849760.0, + "grad_norm": 0.7571110363071637, + "language_loss": 0.56062627, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58731282, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 3.2216598987579346 + }, + { + "auxiliary_loss_clip": 0.01433562, + "auxiliary_loss_mlp": 0.01234126, + "balance_loss_clip": 1.11424899, + "balance_loss_mlp": 1.03700185, + "epoch": 0.5730347211784158, + "flos": 18554109952320.0, + "grad_norm": 2.0503010672493387, + "language_loss": 0.66496158, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.69163847, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.8190226554870605 + }, + { + "auxiliary_loss_clip": 0.01439212, + "auxiliary_loss_mlp": 0.01231797, + "balance_loss_clip": 1.11858678, + "balance_loss_mlp": 1.03562593, + "epoch": 0.5730948444310837, + "flos": 38034359187840.0, + "grad_norm": 2.251168457392364, + "language_loss": 0.76070678, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.78741693, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.8906431198120117 + }, + { + "auxiliary_loss_clip": 0.01431688, + "auxiliary_loss_mlp": 0.01226699, + "balance_loss_clip": 1.1103301, + "balance_loss_mlp": 1.03024173, + "epoch": 0.5731549676837517, + "flos": 25229343142080.0, + "grad_norm": 1.3320607272647786, + "language_loss": 0.78750843, + "learning_rate": 1.625421002822686e-06, + "loss": 0.81409228, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 2.820024251937866 + }, + { + "auxiliary_loss_clip": 0.01434448, + "auxiliary_loss_mlp": 0.01223987, + "balance_loss_clip": 1.11434174, + "balance_loss_mlp": 1.02667201, + "epoch": 0.5732150909364196, + "flos": 23370503368320.0, + "grad_norm": 1.9352034461827594, + "language_loss": 0.85935712, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.88594145, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 2.847414016723633 + }, + { + "auxiliary_loss_clip": 0.01435576, + "auxiliary_loss_mlp": 0.01227994, + "balance_loss_clip": 1.11594605, + "balance_loss_mlp": 1.02877092, + "epoch": 0.5732752141890877, + "flos": 23081791299840.0, + "grad_norm": 1.7773542541011351, + "language_loss": 0.75297517, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77961087, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 2.787997245788574 + }, + { + "auxiliary_loss_clip": 0.01435017, + "auxiliary_loss_mlp": 0.01238257, + "balance_loss_clip": 1.1133213, + "balance_loss_mlp": 1.04094172, + "epoch": 0.5733353374417556, + "flos": 24354559316160.0, + "grad_norm": 1.6816038077938615, + "language_loss": 0.71232843, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73906118, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.893234968185425 + }, + { + "auxiliary_loss_clip": 0.01437929, + "auxiliary_loss_mlp": 0.01235376, + "balance_loss_clip": 1.11780643, + "balance_loss_mlp": 1.04101717, + "epoch": 0.5733954606944236, + "flos": 27201323710080.0, + "grad_norm": 2.1013007128344077, + "language_loss": 0.69824088, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.72497386, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 4.344544887542725 + }, + { + "auxiliary_loss_clip": 0.01434939, + "auxiliary_loss_mlp": 0.01237251, + "balance_loss_clip": 1.11502457, + "balance_loss_mlp": 1.04012644, + "epoch": 0.5734555839470915, + "flos": 28767544814880.0, + "grad_norm": 2.222567235122763, + "language_loss": 0.63307846, + "learning_rate": 1.623508330355902e-06, + "loss": 0.65980035, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.875769853591919 + }, + { + "auxiliary_loss_clip": 0.01435891, + "auxiliary_loss_mlp": 0.01231697, + "balance_loss_clip": 1.11588693, + "balance_loss_mlp": 1.03371406, + "epoch": 0.5735157071997595, + "flos": 22969067715360.0, + "grad_norm": 1.8067520290022652, + "language_loss": 0.83199245, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85866833, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.8398377895355225 + }, + { + "auxiliary_loss_clip": 0.01439077, + "auxiliary_loss_mlp": 0.01231608, + "balance_loss_clip": 1.11961198, + "balance_loss_mlp": 1.03410196, + "epoch": 0.5735758304524274, + "flos": 18991463937120.0, + "grad_norm": 1.9481648855601497, + "language_loss": 0.7295602, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75626707, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 2.899717092514038 + }, + { + "auxiliary_loss_clip": 0.01429662, + "auxiliary_loss_mlp": 0.01221982, + "balance_loss_clip": 1.1085403, + "balance_loss_mlp": 1.02590644, + "epoch": 0.5736359537050955, + "flos": 28400206942080.0, + "grad_norm": 2.554939462134261, + "language_loss": 0.80078608, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82730258, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.9063754081726074 + }, + { + "auxiliary_loss_clip": 0.01435278, + "auxiliary_loss_mlp": 0.01232003, + "balance_loss_clip": 1.11447668, + "balance_loss_mlp": 1.03487825, + "epoch": 0.5736960769577634, + "flos": 15628719754080.0, + "grad_norm": 2.1686551562606597, + "language_loss": 0.64051068, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66718346, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.6970012187957764 + }, + { + "auxiliary_loss_clip": 0.01439688, + "auxiliary_loss_mlp": 0.01228817, + "balance_loss_clip": 1.11702371, + "balance_loss_mlp": 1.03293192, + "epoch": 0.5737562002104314, + "flos": 18005928791040.0, + "grad_norm": 2.4507359539887443, + "language_loss": 0.83788943, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.86457449, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 2.7587289810180664 + }, + { + "auxiliary_loss_clip": 0.01448118, + "auxiliary_loss_mlp": 0.012407, + "balance_loss_clip": 1.12461066, + "balance_loss_mlp": 1.04100013, + "epoch": 0.5738163234630994, + "flos": 20699955027360.0, + "grad_norm": 2.201943224753249, + "language_loss": 0.72869951, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.7555877, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 2.807901620864868 + }, + { + "auxiliary_loss_clip": 0.01442086, + "auxiliary_loss_mlp": 0.01233964, + "balance_loss_clip": 1.12097132, + "balance_loss_mlp": 1.03559947, + "epoch": 0.5738764467157673, + "flos": 23151579842880.0, + "grad_norm": 3.5988200862101, + "language_loss": 0.75947052, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78623098, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 2.783644437789917 + }, + { + "auxiliary_loss_clip": 0.01447793, + "auxiliary_loss_mlp": 0.01238542, + "balance_loss_clip": 1.12452555, + "balance_loss_mlp": 1.03846109, + "epoch": 0.5739365699684353, + "flos": 29495279707200.0, + "grad_norm": 1.8932577178487928, + "language_loss": 0.55926019, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58612359, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 5.738330364227295 + }, + { + "auxiliary_loss_clip": 0.01441474, + "auxiliary_loss_mlp": 0.01230248, + "balance_loss_clip": 1.11871541, + "balance_loss_mlp": 1.03178787, + "epoch": 0.5739966932211032, + "flos": 14028969791520.0, + "grad_norm": 15.806108708620654, + "language_loss": 0.76198792, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78870517, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.78214693069458 + }, + { + "auxiliary_loss_clip": 0.01445851, + "auxiliary_loss_mlp": 0.01233263, + "balance_loss_clip": 1.12272108, + "balance_loss_mlp": 1.0346123, + "epoch": 0.5740568164737713, + "flos": 19064021235840.0, + "grad_norm": 1.8669764417213854, + "language_loss": 0.7432825, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.77007359, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 2.7490580081939697 + }, + { + "auxiliary_loss_clip": 0.01444365, + "auxiliary_loss_mlp": 0.01225386, + "balance_loss_clip": 1.12187672, + "balance_loss_mlp": 1.02587748, + "epoch": 0.5741169397264392, + "flos": 22130050508640.0, + "grad_norm": 2.76452029654264, + "language_loss": 0.69583684, + "learning_rate": 1.619301709822355e-06, + "loss": 0.7225343, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.7948951721191406 + }, + { + "auxiliary_loss_clip": 0.01451063, + "auxiliary_loss_mlp": 0.01225013, + "balance_loss_clip": 1.1286006, + "balance_loss_mlp": 1.02731633, + "epoch": 0.5741770629791072, + "flos": 24939189803520.0, + "grad_norm": 1.8368255851502358, + "language_loss": 0.79690796, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.82366872, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 2.792076349258423 + }, + { + "auxiliary_loss_clip": 0.01450872, + "auxiliary_loss_mlp": 0.01235144, + "balance_loss_clip": 1.12881064, + "balance_loss_mlp": 1.0359211, + "epoch": 0.5742371862317751, + "flos": 18803148801120.0, + "grad_norm": 1.831686681717406, + "language_loss": 0.67888725, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70574749, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 4.318353176116943 + }, + { + "auxiliary_loss_clip": 0.01439258, + "auxiliary_loss_mlp": 0.01228543, + "balance_loss_clip": 1.11901534, + "balance_loss_mlp": 1.03056073, + "epoch": 0.5742973094844431, + "flos": 24462883234080.0, + "grad_norm": 2.104921371740231, + "language_loss": 0.71700519, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74368322, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.809431552886963 + }, + { + "auxiliary_loss_clip": 0.01445262, + "auxiliary_loss_mlp": 0.01225881, + "balance_loss_clip": 1.1229949, + "balance_loss_mlp": 1.02522826, + "epoch": 0.574357432737111, + "flos": 21654919712160.0, + "grad_norm": 1.9529926865541405, + "language_loss": 0.80161881, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82833028, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.7612533569335938 + }, + { + "auxiliary_loss_clip": 0.01432652, + "auxiliary_loss_mlp": 0.01236776, + "balance_loss_clip": 1.11157775, + "balance_loss_mlp": 1.03907943, + "epoch": 0.5744175559897791, + "flos": 16546590334080.0, + "grad_norm": 2.124168411619483, + "language_loss": 0.83735967, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.86405396, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.763319492340088 + }, + { + "auxiliary_loss_clip": 0.01438878, + "auxiliary_loss_mlp": 0.01231712, + "balance_loss_clip": 1.11718798, + "balance_loss_mlp": 1.03287053, + "epoch": 0.574477679242447, + "flos": 24209937784800.0, + "grad_norm": 1.6649906101450098, + "language_loss": 0.70754826, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.73425412, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.851558208465576 + }, + { + "auxiliary_loss_clip": 0.01440042, + "auxiliary_loss_mlp": 0.01226398, + "balance_loss_clip": 1.11911559, + "balance_loss_mlp": 1.02851045, + "epoch": 0.574537802495115, + "flos": 14904777677760.0, + "grad_norm": 2.703110353977638, + "language_loss": 0.72991782, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.75658226, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 2.8583617210388184 + }, + { + "auxiliary_loss_clip": 0.01432989, + "auxiliary_loss_mlp": 0.01222652, + "balance_loss_clip": 1.11262047, + "balance_loss_mlp": 1.02600455, + "epoch": 0.5745979257477829, + "flos": 24937217539200.0, + "grad_norm": 4.277324997256343, + "language_loss": 0.74341959, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76997602, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 2.845484972000122 + }, + { + "auxiliary_loss_clip": 0.0143267, + "auxiliary_loss_mlp": 0.01223821, + "balance_loss_clip": 1.11242747, + "balance_loss_mlp": 1.02688754, + "epoch": 0.5746580490004509, + "flos": 17237534546880.0, + "grad_norm": 1.59419928557312, + "language_loss": 0.67799312, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70455807, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 2.7260541915893555 + }, + { + "auxiliary_loss_clip": 0.01434676, + "auxiliary_loss_mlp": 0.01231265, + "balance_loss_clip": 1.11461914, + "balance_loss_mlp": 1.03251886, + "epoch": 0.5747181722531189, + "flos": 13189535375040.0, + "grad_norm": 2.358302211908833, + "language_loss": 0.71360111, + "learning_rate": 1.615479024621659e-06, + "loss": 0.74026054, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.8462905883789062 + }, + { + "auxiliary_loss_clip": 0.01433466, + "auxiliary_loss_mlp": 0.01222751, + "balance_loss_clip": 1.11481738, + "balance_loss_mlp": 1.02743876, + "epoch": 0.5747782955057869, + "flos": 22965047330400.0, + "grad_norm": 1.745065604478319, + "language_loss": 0.79081333, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81737554, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.7798798084259033 + }, + { + "auxiliary_loss_clip": 0.0142833, + "auxiliary_loss_mlp": 0.01228799, + "balance_loss_clip": 1.10886729, + "balance_loss_mlp": 1.03262806, + "epoch": 0.5748384187584549, + "flos": 23405435568000.0, + "grad_norm": 1.816494444852874, + "language_loss": 0.64569223, + "learning_rate": 1.614714662090588e-06, + "loss": 0.67226356, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 2.7690963745117188 + }, + { + "auxiliary_loss_clip": 0.01436396, + "auxiliary_loss_mlp": 0.0123958, + "balance_loss_clip": 1.11608672, + "balance_loss_mlp": 1.04169226, + "epoch": 0.5748985420111228, + "flos": 17787460403520.0, + "grad_norm": 8.62060948346173, + "language_loss": 0.71226478, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73902452, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 2.8262860774993896 + }, + { + "auxiliary_loss_clip": 0.01431099, + "auxiliary_loss_mlp": 0.01248256, + "balance_loss_clip": 1.11317253, + "balance_loss_mlp": 1.05380237, + "epoch": 0.5749586652637908, + "flos": 19868371740000.0, + "grad_norm": 1.763180551542828, + "language_loss": 0.84208858, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86888212, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.7899813652038574 + }, + { + "auxiliary_loss_clip": 0.01435435, + "auxiliary_loss_mlp": 0.01234151, + "balance_loss_clip": 1.11600399, + "balance_loss_mlp": 1.03635859, + "epoch": 0.5750187885164587, + "flos": 21289326534720.0, + "grad_norm": 3.266279041808396, + "language_loss": 0.57144988, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59814572, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.835594415664673 + }, + { + "auxiliary_loss_clip": 0.01432616, + "auxiliary_loss_mlp": 0.01226641, + "balance_loss_clip": 1.1150825, + "balance_loss_mlp": 1.02999306, + "epoch": 0.5750789117691267, + "flos": 18806524407360.0, + "grad_norm": 2.172091777371946, + "language_loss": 0.76186502, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78845757, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 2.771475315093994 + }, + { + "auxiliary_loss_clip": 0.01471134, + "auxiliary_loss_mlp": 0.01209908, + "balance_loss_clip": 1.18040121, + "balance_loss_mlp": 1.02031708, + "epoch": 0.5751390350217946, + "flos": 70670728360800.0, + "grad_norm": 0.7411179945718446, + "language_loss": 0.60668129, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.63349169, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.4462335109710693 + }, + { + "auxiliary_loss_clip": 0.01435808, + "auxiliary_loss_mlp": 0.01232379, + "balance_loss_clip": 1.11696649, + "balance_loss_mlp": 1.03611338, + "epoch": 0.5751991582744627, + "flos": 14248082957760.0, + "grad_norm": 2.0538156729152424, + "language_loss": 0.75703168, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.78371358, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 2.7459053993225098 + }, + { + "auxiliary_loss_clip": 0.01430742, + "auxiliary_loss_mlp": 0.01227666, + "balance_loss_clip": 1.11173463, + "balance_loss_mlp": 1.03225791, + "epoch": 0.5752592815271306, + "flos": 18329876484480.0, + "grad_norm": 1.5259365124801387, + "language_loss": 0.746351, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.77293509, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.8469326496124268 + }, + { + "auxiliary_loss_clip": 0.01438001, + "auxiliary_loss_mlp": 0.01236392, + "balance_loss_clip": 1.11836517, + "balance_loss_mlp": 1.03850448, + "epoch": 0.5753194047797986, + "flos": 20924833273920.0, + "grad_norm": 2.2417035424145464, + "language_loss": 0.71424109, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.74098504, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.8031816482543945 + }, + { + "auxiliary_loss_clip": 0.014375, + "auxiliary_loss_mlp": 0.01236563, + "balance_loss_clip": 1.11784208, + "balance_loss_mlp": 1.03953409, + "epoch": 0.5753795280324665, + "flos": 19283855037120.0, + "grad_norm": 2.4501221590018365, + "language_loss": 0.55554938, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.58228999, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.750638961791992 + }, + { + "auxiliary_loss_clip": 0.01434358, + "auxiliary_loss_mlp": 0.01233193, + "balance_loss_clip": 1.11465037, + "balance_loss_mlp": 1.0375942, + "epoch": 0.5754396512851345, + "flos": 21655033496640.0, + "grad_norm": 1.5328687904141935, + "language_loss": 0.64125431, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66792983, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 2.7910149097442627 + }, + { + "auxiliary_loss_clip": 0.01435203, + "auxiliary_loss_mlp": 0.0122972, + "balance_loss_clip": 1.11569989, + "balance_loss_mlp": 1.03202283, + "epoch": 0.5754997745378025, + "flos": 51025376751840.0, + "grad_norm": 1.5903701818087297, + "language_loss": 0.67162949, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69827878, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 3.019681453704834 + }, + { + "auxiliary_loss_clip": 0.01436553, + "auxiliary_loss_mlp": 0.01238402, + "balance_loss_clip": 1.11687064, + "balance_loss_mlp": 1.04089594, + "epoch": 0.5755598977904705, + "flos": 22859112886560.0, + "grad_norm": 2.0484483089539274, + "language_loss": 0.72244191, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74919152, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 2.7974557876586914 + }, + { + "auxiliary_loss_clip": 0.0143833, + "auxiliary_loss_mlp": 0.01230563, + "balance_loss_clip": 1.1191839, + "balance_loss_mlp": 1.0349648, + "epoch": 0.5756200210431385, + "flos": 38475923198400.0, + "grad_norm": 1.8492194559708721, + "language_loss": 0.76526225, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.79195124, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 2.919499397277832 + }, + { + "auxiliary_loss_clip": 0.01430959, + "auxiliary_loss_mlp": 0.01238522, + "balance_loss_clip": 1.11156845, + "balance_loss_mlp": 1.03929949, + "epoch": 0.5756801442958064, + "flos": 23912009173440.0, + "grad_norm": 6.294404129713491, + "language_loss": 0.6660533, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.69274807, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 2.784069299697876 + }, + { + "auxiliary_loss_clip": 0.01437053, + "auxiliary_loss_mlp": 0.0123221, + "balance_loss_clip": 1.11787057, + "balance_loss_mlp": 1.0370878, + "epoch": 0.5757402675484744, + "flos": 21107383329600.0, + "grad_norm": 1.50198908659737, + "language_loss": 0.79621863, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.82291132, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 4.177018404006958 + }, + { + "auxiliary_loss_clip": 0.01441223, + "auxiliary_loss_mlp": 0.01224514, + "balance_loss_clip": 1.12035739, + "balance_loss_mlp": 1.02614975, + "epoch": 0.5758003908011423, + "flos": 20561250288960.0, + "grad_norm": 1.7558615017254466, + "language_loss": 0.69760883, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.72426617, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.7846479415893555 + }, + { + "auxiliary_loss_clip": 0.01429725, + "auxiliary_loss_mlp": 0.01233582, + "balance_loss_clip": 1.10916209, + "balance_loss_mlp": 1.03826976, + "epoch": 0.5758605140538103, + "flos": 16474943311200.0, + "grad_norm": 1.773772769777169, + "language_loss": 0.67064548, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.6972785, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.8131000995635986 + }, + { + "auxiliary_loss_clip": 0.01427095, + "auxiliary_loss_mlp": 0.01227763, + "balance_loss_clip": 1.10863066, + "balance_loss_mlp": 1.03340435, + "epoch": 0.5759206373064782, + "flos": 21289667888160.0, + "grad_norm": 1.6354027272139784, + "language_loss": 0.73020184, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.75675046, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 2.8334548473358154 + }, + { + "auxiliary_loss_clip": 0.01436507, + "auxiliary_loss_mlp": 0.01230365, + "balance_loss_clip": 1.11694515, + "balance_loss_mlp": 1.02723193, + "epoch": 0.5759807605591463, + "flos": 26070294684960.0, + "grad_norm": 6.191666970530258, + "language_loss": 0.65110821, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.67777693, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.7859995365142822 + }, + { + "auxiliary_loss_clip": 0.01430531, + "auxiliary_loss_mlp": 0.01234576, + "balance_loss_clip": 1.11208785, + "balance_loss_mlp": 1.03468633, + "epoch": 0.5760408838118142, + "flos": 18874682039520.0, + "grad_norm": 2.6539764908438257, + "language_loss": 0.85766453, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.88431561, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.8066892623901367 + }, + { + "auxiliary_loss_clip": 0.01440461, + "auxiliary_loss_mlp": 0.01241408, + "balance_loss_clip": 1.12083125, + "balance_loss_mlp": 1.04609597, + "epoch": 0.5761010070644822, + "flos": 15379984330560.0, + "grad_norm": 2.4188738525526596, + "language_loss": 0.67454708, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.70136577, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 2.7345075607299805 + }, + { + "auxiliary_loss_clip": 0.01465959, + "auxiliary_loss_mlp": 0.0121106, + "balance_loss_clip": 1.17492199, + "balance_loss_mlp": 1.02261353, + "epoch": 0.5761611303171501, + "flos": 71479819884960.0, + "grad_norm": 0.7626141590547504, + "language_loss": 0.57168245, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59845257, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 5.0540547370910645 + }, + { + "auxiliary_loss_clip": 0.01434681, + "auxiliary_loss_mlp": 0.01232557, + "balance_loss_clip": 1.11512506, + "balance_loss_mlp": 1.03323901, + "epoch": 0.5762212535698181, + "flos": 16247637662400.0, + "grad_norm": 2.1032438455454203, + "language_loss": 0.82254201, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84921438, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 4.245422840118408 + }, + { + "auxiliary_loss_clip": 0.01462557, + "auxiliary_loss_mlp": 0.01215385, + "balance_loss_clip": 1.17114592, + "balance_loss_mlp": 1.02732086, + "epoch": 0.5762813768224861, + "flos": 70192790880480.0, + "grad_norm": 0.6261917219645988, + "language_loss": 0.49528149, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.52206093, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 3.3356058597564697 + }, + { + "auxiliary_loss_clip": 0.0142987, + "auxiliary_loss_mlp": 0.01228428, + "balance_loss_clip": 1.1100769, + "balance_loss_mlp": 1.03521407, + "epoch": 0.5763415000751541, + "flos": 20519908230240.0, + "grad_norm": 1.6685327415957745, + "language_loss": 0.84873366, + "learning_rate": 1.605165098835465e-06, + "loss": 0.87531662, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.8946666717529297 + }, + { + "auxiliary_loss_clip": 0.014312, + "auxiliary_loss_mlp": 0.01223583, + "balance_loss_clip": 1.1094377, + "balance_loss_mlp": 1.02445567, + "epoch": 0.5764016233278221, + "flos": 15817376243520.0, + "grad_norm": 2.1239990407051317, + "language_loss": 0.80215192, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82869977, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 2.82473087310791 + }, + { + "auxiliary_loss_clip": 0.01434504, + "auxiliary_loss_mlp": 0.01230077, + "balance_loss_clip": 1.11411047, + "balance_loss_mlp": 1.03190351, + "epoch": 0.57646174658049, + "flos": 20774143236960.0, + "grad_norm": 1.5689655627213308, + "language_loss": 0.65924031, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68588614, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.7631967067718506 + }, + { + "auxiliary_loss_clip": 0.01430479, + "auxiliary_loss_mlp": 0.01223012, + "balance_loss_clip": 1.10940456, + "balance_loss_mlp": 1.02531552, + "epoch": 0.576521869833158, + "flos": 23552181076320.0, + "grad_norm": 2.60394452988509, + "language_loss": 0.78623486, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.81276977, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 2.846790313720703 + }, + { + "auxiliary_loss_clip": 0.01427071, + "auxiliary_loss_mlp": 0.01221921, + "balance_loss_clip": 1.10689545, + "balance_loss_mlp": 1.02508283, + "epoch": 0.5765819930858259, + "flos": 20268669548160.0, + "grad_norm": 1.9988293554817362, + "language_loss": 0.79515374, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.82164365, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 4.32176399230957 + }, + { + "auxiliary_loss_clip": 0.01429906, + "auxiliary_loss_mlp": 0.0122666, + "balance_loss_clip": 1.10976219, + "balance_loss_mlp": 1.03068018, + "epoch": 0.5766421163384939, + "flos": 23151010920480.0, + "grad_norm": 1.7961183003344199, + "language_loss": 0.62895143, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.6555171, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 2.7846486568450928 + }, + { + "auxiliary_loss_clip": 0.01430864, + "auxiliary_loss_mlp": 0.01227518, + "balance_loss_clip": 1.10943496, + "balance_loss_mlp": 1.02972603, + "epoch": 0.5767022395911618, + "flos": 25851295303200.0, + "grad_norm": 1.7639917566636236, + "language_loss": 0.78268278, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.80926663, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.814976453781128 + }, + { + "auxiliary_loss_clip": 0.01444686, + "auxiliary_loss_mlp": 0.01184494, + "balance_loss_clip": 1.15184116, + "balance_loss_mlp": 0.99185181, + "epoch": 0.5767623628438299, + "flos": 68300763602400.0, + "grad_norm": 0.7337718840141781, + "language_loss": 0.59544766, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.62173951, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.472304582595825 + }, + { + "auxiliary_loss_clip": 0.01430143, + "auxiliary_loss_mlp": 0.01228225, + "balance_loss_clip": 1.10995007, + "balance_loss_mlp": 1.02928853, + "epoch": 0.5768224860964978, + "flos": 30190964940000.0, + "grad_norm": 1.5948817499747048, + "language_loss": 0.70956057, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73614424, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 2.8582875728607178 + }, + { + "auxiliary_loss_clip": 0.01434171, + "auxiliary_loss_mlp": 0.01225414, + "balance_loss_clip": 1.11376548, + "balance_loss_mlp": 1.02952921, + "epoch": 0.5768826093491658, + "flos": 17896998022560.0, + "grad_norm": 1.7948526954197233, + "language_loss": 0.70737088, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73396671, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 2.7790071964263916 + }, + { + "auxiliary_loss_clip": 0.01428726, + "auxiliary_loss_mlp": 0.01235477, + "balance_loss_clip": 1.10782862, + "balance_loss_mlp": 1.03654099, + "epoch": 0.5769427326018337, + "flos": 17459113043520.0, + "grad_norm": 2.3396660534634166, + "language_loss": 0.69563341, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.7222755, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 2.727555274963379 + }, + { + "auxiliary_loss_clip": 0.01431733, + "auxiliary_loss_mlp": 0.0122597, + "balance_loss_clip": 1.11068654, + "balance_loss_mlp": 1.02913213, + "epoch": 0.5770028558545017, + "flos": 39424895233920.0, + "grad_norm": 2.2831121143801396, + "language_loss": 0.67692471, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.70350182, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 2.9881887435913086 + }, + { + "auxiliary_loss_clip": 0.01429019, + "auxiliary_loss_mlp": 0.01236368, + "balance_loss_clip": 1.10813904, + "balance_loss_mlp": 1.03952956, + "epoch": 0.5770629791071697, + "flos": 21538137814560.0, + "grad_norm": 1.7546473147159678, + "language_loss": 0.8192209, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.84587467, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.7520527839660645 + }, + { + "auxiliary_loss_clip": 0.01427022, + "auxiliary_loss_mlp": 0.01238579, + "balance_loss_clip": 1.10586524, + "balance_loss_mlp": 1.04307628, + "epoch": 0.5771231023598377, + "flos": 20888914942080.0, + "grad_norm": 1.5236897019458218, + "language_loss": 0.73139501, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.75805104, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.8302202224731445 + }, + { + "auxiliary_loss_clip": 0.01432106, + "auxiliary_loss_mlp": 0.01215702, + "balance_loss_clip": 1.11159515, + "balance_loss_mlp": 1.02067614, + "epoch": 0.5771832256125057, + "flos": 18079168796640.0, + "grad_norm": 1.9273193183176622, + "language_loss": 0.77911639, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.8055945, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 2.8408091068267822 + }, + { + "auxiliary_loss_clip": 0.01431148, + "auxiliary_loss_mlp": 0.01235483, + "balance_loss_clip": 1.11162627, + "balance_loss_mlp": 1.03912163, + "epoch": 0.5772433488651736, + "flos": 26361472083840.0, + "grad_norm": 1.6298722524227018, + "language_loss": 0.72213781, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74880415, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.7916419506073 + }, + { + "auxiliary_loss_clip": 0.0143702, + "auxiliary_loss_mlp": 0.01236192, + "balance_loss_clip": 1.11537123, + "balance_loss_mlp": 1.03792346, + "epoch": 0.5773034721178416, + "flos": 19682939144160.0, + "grad_norm": 1.7899099665478788, + "language_loss": 0.68566042, + "learning_rate": 1.599058274973348e-06, + "loss": 0.71239245, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 2.863060712814331 + }, + { + "auxiliary_loss_clip": 0.01431108, + "auxiliary_loss_mlp": 0.01222657, + "balance_loss_clip": 1.11044824, + "balance_loss_mlp": 1.02744031, + "epoch": 0.5773635953705095, + "flos": 25085252604960.0, + "grad_norm": 2.4408822093699736, + "language_loss": 0.73385096, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.76038855, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 2.778848648071289 + }, + { + "auxiliary_loss_clip": 0.01429983, + "auxiliary_loss_mlp": 0.01226758, + "balance_loss_clip": 1.10919929, + "balance_loss_mlp": 1.02887022, + "epoch": 0.5774237186231775, + "flos": 21035129456160.0, + "grad_norm": 1.7251229736444238, + "language_loss": 0.76496869, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.79153609, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.8074028491973877 + }, + { + "auxiliary_loss_clip": 0.0143624, + "auxiliary_loss_mlp": 0.01235924, + "balance_loss_clip": 1.11539924, + "balance_loss_mlp": 1.03670192, + "epoch": 0.5774838418758454, + "flos": 15233542247520.0, + "grad_norm": 1.8100420688714518, + "language_loss": 0.83687329, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.86359495, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 2.825505256652832 + }, + { + "auxiliary_loss_clip": 0.01433047, + "auxiliary_loss_mlp": 0.01240619, + "balance_loss_clip": 1.11283851, + "balance_loss_mlp": 1.04092014, + "epoch": 0.5775439651285135, + "flos": 23584647945600.0, + "grad_norm": 1.8420678443838212, + "language_loss": 0.78218627, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80892289, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 2.7974212169647217 + }, + { + "auxiliary_loss_clip": 0.01429471, + "auxiliary_loss_mlp": 0.01223624, + "balance_loss_clip": 1.10819769, + "balance_loss_mlp": 1.02840734, + "epoch": 0.5776040883811814, + "flos": 18042453973440.0, + "grad_norm": 1.7833687832616223, + "language_loss": 0.74173874, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76826972, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.758002281188965 + }, + { + "auxiliary_loss_clip": 0.01430616, + "auxiliary_loss_mlp": 0.01234282, + "balance_loss_clip": 1.10927641, + "balance_loss_mlp": 1.03639495, + "epoch": 0.5776642116338494, + "flos": 18626629322880.0, + "grad_norm": 1.8789562067127477, + "language_loss": 0.69586134, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.72251034, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 2.7328639030456543 + }, + { + "auxiliary_loss_clip": 0.01427735, + "auxiliary_loss_mlp": 0.012293, + "balance_loss_clip": 1.10771489, + "balance_loss_mlp": 1.02969635, + "epoch": 0.5777243348865173, + "flos": 28405061746560.0, + "grad_norm": 2.009237453955176, + "language_loss": 0.75977075, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78634107, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.820924997329712 + }, + { + "auxiliary_loss_clip": 0.01423854, + "auxiliary_loss_mlp": 0.0122432, + "balance_loss_clip": 1.10389423, + "balance_loss_mlp": 1.0270046, + "epoch": 0.5777844581391853, + "flos": 24027008447520.0, + "grad_norm": 1.6317226158207685, + "language_loss": 0.77305448, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79953623, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 2.8210721015930176 + }, + { + "auxiliary_loss_clip": 0.01429038, + "auxiliary_loss_mlp": 0.0123312, + "balance_loss_clip": 1.11005425, + "balance_loss_mlp": 1.03437471, + "epoch": 0.5778445813918534, + "flos": 17778850711200.0, + "grad_norm": 2.8325576689384673, + "language_loss": 0.68775797, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.71437955, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.8159193992614746 + }, + { + "auxiliary_loss_clip": 0.01425726, + "auxiliary_loss_mlp": 0.01235582, + "balance_loss_clip": 1.10608292, + "balance_loss_mlp": 1.03922105, + "epoch": 0.5779047046445213, + "flos": 22235188461120.0, + "grad_norm": 2.3229405548605784, + "language_loss": 0.82969075, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85630381, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.827503204345703 + }, + { + "auxiliary_loss_clip": 0.01426077, + "auxiliary_loss_mlp": 0.01229659, + "balance_loss_clip": 1.1074512, + "balance_loss_mlp": 1.03053212, + "epoch": 0.5779648278971893, + "flos": 21436527180960.0, + "grad_norm": 2.0286347442532318, + "language_loss": 0.79649365, + "learning_rate": 1.594862087742667e-06, + "loss": 0.82305098, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 4.115844011306763 + }, + { + "auxiliary_loss_clip": 0.01422182, + "auxiliary_loss_mlp": 0.01227721, + "balance_loss_clip": 1.10345566, + "balance_loss_mlp": 1.03135991, + "epoch": 0.5780249511498572, + "flos": 19028140832160.0, + "grad_norm": 3.8846527816761607, + "language_loss": 0.77563894, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.80213797, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 2.758822202682495 + }, + { + "auxiliary_loss_clip": 0.0142571, + "auxiliary_loss_mlp": 0.01228329, + "balance_loss_clip": 1.10557258, + "balance_loss_mlp": 1.02777159, + "epoch": 0.5780850744025252, + "flos": 12125981275200.0, + "grad_norm": 2.272273501662529, + "language_loss": 0.81359768, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.84013808, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.759904623031616 + }, + { + "auxiliary_loss_clip": 0.01423334, + "auxiliary_loss_mlp": 0.01222065, + "balance_loss_clip": 1.1036551, + "balance_loss_mlp": 1.02284288, + "epoch": 0.5781451976551931, + "flos": 25046679301920.0, + "grad_norm": 1.9792987486729814, + "language_loss": 0.67242873, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69888276, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.8443024158477783 + }, + { + "auxiliary_loss_clip": 0.01424623, + "auxiliary_loss_mlp": 0.01219012, + "balance_loss_clip": 1.10654569, + "balance_loss_mlp": 1.02160156, + "epoch": 0.5782053209078611, + "flos": 19247671208160.0, + "grad_norm": 2.0710648039677184, + "language_loss": 0.77735025, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.80378664, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 2.7651727199554443 + }, + { + "auxiliary_loss_clip": 0.01430205, + "auxiliary_loss_mlp": 0.01218479, + "balance_loss_clip": 1.11063004, + "balance_loss_mlp": 1.01887512, + "epoch": 0.578265444160529, + "flos": 25996182331680.0, + "grad_norm": 1.6901102858671007, + "language_loss": 0.75078601, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77727288, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.810476541519165 + }, + { + "auxiliary_loss_clip": 0.01421647, + "auxiliary_loss_mlp": 0.01214399, + "balance_loss_clip": 1.10304952, + "balance_loss_mlp": 1.01708412, + "epoch": 0.5783255674131971, + "flos": 21800754944640.0, + "grad_norm": 1.5504516713660246, + "language_loss": 0.81598186, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84234232, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.8388559818267822 + }, + { + "auxiliary_loss_clip": 0.01419966, + "auxiliary_loss_mlp": 0.01221863, + "balance_loss_clip": 1.10177398, + "balance_loss_mlp": 1.02502441, + "epoch": 0.578385690665865, + "flos": 24792027085440.0, + "grad_norm": 3.5277520745902144, + "language_loss": 0.72144043, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.7478587, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 2.8284871578216553 + }, + { + "auxiliary_loss_clip": 0.01421401, + "auxiliary_loss_mlp": 0.01227152, + "balance_loss_clip": 1.10415506, + "balance_loss_mlp": 1.03107691, + "epoch": 0.578445813918533, + "flos": 21214872828000.0, + "grad_norm": 1.848343759447369, + "language_loss": 0.76952231, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79600787, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 4.491133451461792 + }, + { + "auxiliary_loss_clip": 0.01418953, + "auxiliary_loss_mlp": 0.01219239, + "balance_loss_clip": 1.10217035, + "balance_loss_mlp": 1.02144742, + "epoch": 0.5785059371712009, + "flos": 25049182560480.0, + "grad_norm": 1.6579013844827712, + "language_loss": 0.70803756, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.73441947, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 4.39555287361145 + }, + { + "auxiliary_loss_clip": 0.0144555, + "auxiliary_loss_mlp": 0.01202202, + "balance_loss_clip": 1.15747678, + "balance_loss_mlp": 1.01261139, + "epoch": 0.5785660604238689, + "flos": 70850547588960.0, + "grad_norm": 0.8007174412253801, + "language_loss": 0.55931163, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58578914, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.3863589763641357 + }, + { + "auxiliary_loss_clip": 0.01421925, + "auxiliary_loss_mlp": 0.01231628, + "balance_loss_clip": 1.10233378, + "balance_loss_mlp": 1.03307295, + "epoch": 0.578626183676537, + "flos": 31652692871040.0, + "grad_norm": 2.16678993685194, + "language_loss": 0.71452087, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.74105638, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 2.8433475494384766 + }, + { + "auxiliary_loss_clip": 0.01431372, + "auxiliary_loss_mlp": 0.01222198, + "balance_loss_clip": 1.11395371, + "balance_loss_mlp": 1.02183151, + "epoch": 0.5786863069292049, + "flos": 21866978240640.0, + "grad_norm": 1.7945205903222068, + "language_loss": 0.81994075, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84647644, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 2.773482084274292 + }, + { + "auxiliary_loss_clip": 0.01426795, + "auxiliary_loss_mlp": 0.01228472, + "balance_loss_clip": 1.11022198, + "balance_loss_mlp": 1.03163409, + "epoch": 0.5787464301818729, + "flos": 23367051905760.0, + "grad_norm": 1.3858426162363742, + "language_loss": 0.69997704, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72652972, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.8405301570892334 + }, + { + "auxiliary_loss_clip": 0.0142529, + "auxiliary_loss_mlp": 0.01226783, + "balance_loss_clip": 1.10851359, + "balance_loss_mlp": 1.03118443, + "epoch": 0.5788065534345408, + "flos": 30006404691840.0, + "grad_norm": 1.4421614444671, + "language_loss": 0.71663338, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.74315417, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.9555442333221436 + }, + { + "auxiliary_loss_clip": 0.01421622, + "auxiliary_loss_mlp": 0.01216619, + "balance_loss_clip": 1.10411894, + "balance_loss_mlp": 1.0203526, + "epoch": 0.5788666766872088, + "flos": 24529751308800.0, + "grad_norm": 1.6284141075791112, + "language_loss": 0.83998507, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86636746, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 4.390478610992432 + }, + { + "auxiliary_loss_clip": 0.01423991, + "auxiliary_loss_mlp": 0.01230768, + "balance_loss_clip": 1.10693955, + "balance_loss_mlp": 1.03660035, + "epoch": 0.5789267999398767, + "flos": 23735110413600.0, + "grad_norm": 1.668041412491925, + "language_loss": 0.72177768, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74832529, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 2.7933027744293213 + }, + { + "auxiliary_loss_clip": 0.01433754, + "auxiliary_loss_mlp": 0.01230405, + "balance_loss_clip": 1.11646295, + "balance_loss_mlp": 1.03127742, + "epoch": 0.5789869231925447, + "flos": 21136778017920.0, + "grad_norm": 2.6479208584263922, + "language_loss": 0.74666715, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.77330869, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.787003517150879 + }, + { + "auxiliary_loss_clip": 0.01429731, + "auxiliary_loss_mlp": 0.01230508, + "balance_loss_clip": 1.11225212, + "balance_loss_mlp": 1.03338397, + "epoch": 0.5790470464452127, + "flos": 21211459293600.0, + "grad_norm": 1.8021615196766954, + "language_loss": 0.79309255, + "learning_rate": 1.587999618060523e-06, + "loss": 0.81969494, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 2.8000967502593994 + }, + { + "auxiliary_loss_clip": 0.01424361, + "auxiliary_loss_mlp": 0.01226409, + "balance_loss_clip": 1.10675001, + "balance_loss_mlp": 1.03243184, + "epoch": 0.5791071696978807, + "flos": 23406649269120.0, + "grad_norm": 1.8085941668818077, + "language_loss": 0.75369805, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.78020579, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.834688663482666 + }, + { + "auxiliary_loss_clip": 0.01431043, + "auxiliary_loss_mlp": 0.01220964, + "balance_loss_clip": 1.11195779, + "balance_loss_mlp": 1.02193189, + "epoch": 0.5791672929505486, + "flos": 24208799940000.0, + "grad_norm": 2.194031823320468, + "language_loss": 0.80018747, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.8267076, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 2.809065580368042 + }, + { + "auxiliary_loss_clip": 0.01429509, + "auxiliary_loss_mlp": 0.01235686, + "balance_loss_clip": 1.11064243, + "balance_loss_mlp": 1.03922915, + "epoch": 0.5792274162032166, + "flos": 24350956140960.0, + "grad_norm": 2.2450401491339975, + "language_loss": 0.77864945, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.80530131, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.8069469928741455 + }, + { + "auxiliary_loss_clip": 0.01428991, + "auxiliary_loss_mlp": 0.01242081, + "balance_loss_clip": 1.10978425, + "balance_loss_mlp": 1.04486084, + "epoch": 0.5792875394558845, + "flos": 20451523029120.0, + "grad_norm": 3.3972384757989373, + "language_loss": 0.63600433, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.66271508, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.843595027923584 + }, + { + "auxiliary_loss_clip": 0.01427014, + "auxiliary_loss_mlp": 0.01236645, + "balance_loss_clip": 1.10888243, + "balance_loss_mlp": 1.04199982, + "epoch": 0.5793476627085525, + "flos": 24062585425920.0, + "grad_norm": 1.401585009822565, + "language_loss": 0.77159297, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79822958, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 2.8293466567993164 + }, + { + "auxiliary_loss_clip": 0.01426007, + "auxiliary_loss_mlp": 0.01225798, + "balance_loss_clip": 1.10799229, + "balance_loss_mlp": 1.03191566, + "epoch": 0.5794077859612206, + "flos": 22056469149600.0, + "grad_norm": 1.9318492458977494, + "language_loss": 0.67986107, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70637912, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 2.8300139904022217 + }, + { + "auxiliary_loss_clip": 0.01428026, + "auxiliary_loss_mlp": 0.01227549, + "balance_loss_clip": 1.10989594, + "balance_loss_mlp": 1.03023338, + "epoch": 0.5794679092138885, + "flos": 11436326619840.0, + "grad_norm": 2.36438784503204, + "language_loss": 0.72085857, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74741429, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 2.721379280090332 + }, + { + "auxiliary_loss_clip": 0.01433111, + "auxiliary_loss_mlp": 0.01241766, + "balance_loss_clip": 1.11577368, + "balance_loss_mlp": 1.04826522, + "epoch": 0.5795280324665565, + "flos": 18882229743360.0, + "grad_norm": 1.721738071521723, + "language_loss": 0.72313172, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74988043, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.7986786365509033 + }, + { + "auxiliary_loss_clip": 0.01426344, + "auxiliary_loss_mlp": 0.01226527, + "balance_loss_clip": 1.10966814, + "balance_loss_mlp": 1.03197706, + "epoch": 0.5795881557192244, + "flos": 13007630098080.0, + "grad_norm": 2.3124691795618264, + "language_loss": 0.69835436, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.72488308, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 2.7345166206359863 + }, + { + "auxiliary_loss_clip": 0.01425894, + "auxiliary_loss_mlp": 0.01240435, + "balance_loss_clip": 1.10930943, + "balance_loss_mlp": 1.04302466, + "epoch": 0.5796482789718924, + "flos": 19934139898080.0, + "grad_norm": 2.713372426936365, + "language_loss": 0.77740318, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.80406642, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 2.747982978820801 + }, + { + "auxiliary_loss_clip": 0.01427488, + "auxiliary_loss_mlp": 0.01237337, + "balance_loss_clip": 1.11155605, + "balance_loss_mlp": 1.04250121, + "epoch": 0.5797084022245603, + "flos": 21652833663360.0, + "grad_norm": 1.8709789899298668, + "language_loss": 0.73536384, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.76201212, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.7912800312042236 + }, + { + "auxiliary_loss_clip": 0.01426536, + "auxiliary_loss_mlp": 0.0122809, + "balance_loss_clip": 1.11181521, + "balance_loss_mlp": 1.03287244, + "epoch": 0.5797685254772283, + "flos": 26033997071520.0, + "grad_norm": 1.576671683837089, + "language_loss": 0.73512584, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.76167208, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 2.7910525798797607 + }, + { + "auxiliary_loss_clip": 0.01422814, + "auxiliary_loss_mlp": 0.01225414, + "balance_loss_clip": 1.10618448, + "balance_loss_mlp": 1.02943373, + "epoch": 0.5798286487298963, + "flos": 22707057435840.0, + "grad_norm": 3.7201833054201026, + "language_loss": 0.67670131, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.70318353, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.824692487716675 + }, + { + "auxiliary_loss_clip": 0.01430616, + "auxiliary_loss_mlp": 0.01231738, + "balance_loss_clip": 1.1147809, + "balance_loss_mlp": 1.03375506, + "epoch": 0.5798887719825643, + "flos": 23151238489440.0, + "grad_norm": 2.9107792401910046, + "language_loss": 0.85945141, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.88607502, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.7800612449645996 + }, + { + "auxiliary_loss_clip": 0.0143126, + "auxiliary_loss_mlp": 0.01229959, + "balance_loss_clip": 1.11485386, + "balance_loss_mlp": 1.03130841, + "epoch": 0.5799488952352322, + "flos": 24428368244160.0, + "grad_norm": 1.8635587811514822, + "language_loss": 0.75161612, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77822834, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 2.8712613582611084 + }, + { + "auxiliary_loss_clip": 0.01431805, + "auxiliary_loss_mlp": 0.01235326, + "balance_loss_clip": 1.11647475, + "balance_loss_mlp": 1.03772426, + "epoch": 0.5800090184879002, + "flos": 38398776592320.0, + "grad_norm": 2.027880944218081, + "language_loss": 0.59141576, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61808711, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.9048330783843994 + }, + { + "auxiliary_loss_clip": 0.01425279, + "auxiliary_loss_mlp": 0.01233669, + "balance_loss_clip": 1.10925901, + "balance_loss_mlp": 1.03978753, + "epoch": 0.5800691417405681, + "flos": 19786484113920.0, + "grad_norm": 1.479950395917135, + "language_loss": 0.84268034, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86926985, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.7852771282196045 + }, + { + "auxiliary_loss_clip": 0.01479746, + "auxiliary_loss_mlp": 0.01212769, + "balance_loss_clip": 1.19640613, + "balance_loss_mlp": 1.02241516, + "epoch": 0.5801292649932361, + "flos": 70320913662240.0, + "grad_norm": 0.8467050641997498, + "language_loss": 0.62921894, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65614408, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 3.363051652908325 + }, + { + "auxiliary_loss_clip": 0.01427777, + "auxiliary_loss_mlp": 0.01220515, + "balance_loss_clip": 1.1119324, + "balance_loss_mlp": 1.02663302, + "epoch": 0.5801893882459042, + "flos": 18736849648800.0, + "grad_norm": 1.8548502722551568, + "language_loss": 0.8183583, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84484124, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.778388261795044 + }, + { + "auxiliary_loss_clip": 0.01425511, + "auxiliary_loss_mlp": 0.01229043, + "balance_loss_clip": 1.10931373, + "balance_loss_mlp": 1.03191841, + "epoch": 0.5802495114985721, + "flos": 15598338933600.0, + "grad_norm": 2.460339005896142, + "language_loss": 0.7755968, + "learning_rate": 1.580380592177698e-06, + "loss": 0.80214238, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 4.1403422355651855 + }, + { + "auxiliary_loss_clip": 0.01429523, + "auxiliary_loss_mlp": 0.01237814, + "balance_loss_clip": 1.11420381, + "balance_loss_mlp": 1.0390687, + "epoch": 0.5803096347512401, + "flos": 18257129544960.0, + "grad_norm": 1.7228824737992583, + "language_loss": 0.74514085, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.77181423, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.813762903213501 + }, + { + "auxiliary_loss_clip": 0.01422996, + "auxiliary_loss_mlp": 0.01231572, + "balance_loss_clip": 1.10792542, + "balance_loss_mlp": 1.03330278, + "epoch": 0.580369758003908, + "flos": 22895372571840.0, + "grad_norm": 3.1351295515975277, + "language_loss": 0.76972258, + "learning_rate": 1.579619037747193e-06, + "loss": 0.79626822, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 2.875518798828125 + }, + { + "auxiliary_loss_clip": 0.01424253, + "auxiliary_loss_mlp": 0.01230785, + "balance_loss_clip": 1.1091454, + "balance_loss_mlp": 1.03356516, + "epoch": 0.580429881256576, + "flos": 18699679687680.0, + "grad_norm": 2.090822986113553, + "language_loss": 0.74357915, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.77012944, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.7664926052093506 + }, + { + "auxiliary_loss_clip": 0.01430451, + "auxiliary_loss_mlp": 0.01222961, + "balance_loss_clip": 1.1155709, + "balance_loss_mlp": 1.0267899, + "epoch": 0.5804900045092439, + "flos": 24684347946240.0, + "grad_norm": 2.601071353301155, + "language_loss": 0.70218861, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72872275, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.7883312702178955 + }, + { + "auxiliary_loss_clip": 0.01428433, + "auxiliary_loss_mlp": 0.01231665, + "balance_loss_clip": 1.11205089, + "balance_loss_mlp": 1.03368258, + "epoch": 0.580550127761912, + "flos": 23115054660480.0, + "grad_norm": 2.310961692255723, + "language_loss": 0.69730806, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.72390902, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 2.8179256916046143 + }, + { + "auxiliary_loss_clip": 0.01428354, + "auxiliary_loss_mlp": 0.01226529, + "balance_loss_clip": 1.11377418, + "balance_loss_mlp": 1.02950025, + "epoch": 0.5806102510145799, + "flos": 18477494340480.0, + "grad_norm": 1.6724780010645475, + "language_loss": 0.71757495, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.74412382, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.723076343536377 + }, + { + "auxiliary_loss_clip": 0.01425409, + "auxiliary_loss_mlp": 0.01236694, + "balance_loss_clip": 1.10969234, + "balance_loss_mlp": 1.03871083, + "epoch": 0.5806703742672479, + "flos": 23917963894560.0, + "grad_norm": 2.233774410820822, + "language_loss": 0.70843256, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73505354, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.821333169937134 + }, + { + "auxiliary_loss_clip": 0.0147016, + "auxiliary_loss_mlp": 0.01206848, + "balance_loss_clip": 1.18811989, + "balance_loss_mlp": 1.01649475, + "epoch": 0.5807304975199158, + "flos": 66318732072000.0, + "grad_norm": 0.6572382612841036, + "language_loss": 0.53550053, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.56227064, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 3.3101396560668945 + }, + { + "auxiliary_loss_clip": 0.01420668, + "auxiliary_loss_mlp": 0.0122759, + "balance_loss_clip": 1.1047951, + "balance_loss_mlp": 1.03275418, + "epoch": 0.5807906207725838, + "flos": 31725136385280.0, + "grad_norm": 2.516154833232238, + "language_loss": 0.6266942, + "learning_rate": 1.576954100136366e-06, + "loss": 0.65317678, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 5.900420904159546 + }, + { + "auxiliary_loss_clip": 0.01418328, + "auxiliary_loss_mlp": 0.01228318, + "balance_loss_clip": 1.1021111, + "balance_loss_mlp": 1.0310986, + "epoch": 0.5808507440252517, + "flos": 23803192189440.0, + "grad_norm": 3.1033370077516746, + "language_loss": 0.65508926, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.68155569, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.772482395172119 + }, + { + "auxiliary_loss_clip": 0.01421759, + "auxiliary_loss_mlp": 0.012179, + "balance_loss_clip": 1.10699606, + "balance_loss_mlp": 1.02468562, + "epoch": 0.5809108672779197, + "flos": 13700129365440.0, + "grad_norm": 1.5690278435989973, + "language_loss": 0.74607605, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.77247262, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.7837727069854736 + }, + { + "auxiliary_loss_clip": 0.01457303, + "auxiliary_loss_mlp": 0.01193855, + "balance_loss_clip": 1.17430496, + "balance_loss_mlp": 1.00197601, + "epoch": 0.5809709905305876, + "flos": 69142511636640.0, + "grad_norm": 0.870940810557471, + "language_loss": 0.58294153, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60945314, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.3482613563537598 + }, + { + "auxiliary_loss_clip": 0.01420552, + "auxiliary_loss_mlp": 0.01218807, + "balance_loss_clip": 1.10387373, + "balance_loss_mlp": 1.02244568, + "epoch": 0.5810311137832557, + "flos": 19829532939840.0, + "grad_norm": 2.213379624476884, + "language_loss": 0.81653064, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84292424, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 2.7627696990966797 + }, + { + "auxiliary_loss_clip": 0.01413394, + "auxiliary_loss_mlp": 0.01221575, + "balance_loss_clip": 1.09662414, + "balance_loss_mlp": 1.02635813, + "epoch": 0.5810912370359237, + "flos": 29240892987840.0, + "grad_norm": 2.1734858427344936, + "language_loss": 0.81877637, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.84512603, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.837188959121704 + }, + { + "auxiliary_loss_clip": 0.01423036, + "auxiliary_loss_mlp": 0.01237976, + "balance_loss_clip": 1.10557091, + "balance_loss_mlp": 1.04113746, + "epoch": 0.5811513602885916, + "flos": 22787883073440.0, + "grad_norm": 5.883501377117202, + "language_loss": 0.81331354, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83992362, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 4.252375364303589 + }, + { + "auxiliary_loss_clip": 0.01424858, + "auxiliary_loss_mlp": 0.01231054, + "balance_loss_clip": 1.10859251, + "balance_loss_mlp": 1.0348835, + "epoch": 0.5812114835412596, + "flos": 18736356582720.0, + "grad_norm": 1.864764471884751, + "language_loss": 0.80067915, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.8272382, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.8040335178375244 + }, + { + "auxiliary_loss_clip": 0.01418744, + "auxiliary_loss_mlp": 0.012311, + "balance_loss_clip": 1.10186768, + "balance_loss_mlp": 1.03302193, + "epoch": 0.5812716067939275, + "flos": 26433650100960.0, + "grad_norm": 1.8555107832743545, + "language_loss": 0.78830767, + "learning_rate": 1.573909419957653e-06, + "loss": 0.8148061, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 2.837526798248291 + }, + { + "auxiliary_loss_clip": 0.01423045, + "auxiliary_loss_mlp": 0.01222141, + "balance_loss_clip": 1.10659242, + "balance_loss_mlp": 1.0246346, + "epoch": 0.5813317300465956, + "flos": 43401209454720.0, + "grad_norm": 1.871108793821069, + "language_loss": 0.64415842, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.67061031, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.9486172199249268 + }, + { + "auxiliary_loss_clip": 0.01423794, + "auxiliary_loss_mlp": 0.01230835, + "balance_loss_clip": 1.10696697, + "balance_loss_mlp": 1.03208888, + "epoch": 0.5813918532992635, + "flos": 24787513634400.0, + "grad_norm": 1.5267116438113741, + "language_loss": 0.73207748, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75862384, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 2.880521774291992 + }, + { + "auxiliary_loss_clip": 0.01421207, + "auxiliary_loss_mlp": 0.0122953, + "balance_loss_clip": 1.10491037, + "balance_loss_mlp": 1.03154755, + "epoch": 0.5814519765519315, + "flos": 22859605952640.0, + "grad_norm": 2.136026801974731, + "language_loss": 0.79333282, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81984019, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 2.7526679039001465 + }, + { + "auxiliary_loss_clip": 0.01424933, + "auxiliary_loss_mlp": 0.01236064, + "balance_loss_clip": 1.10769486, + "balance_loss_mlp": 1.03884435, + "epoch": 0.5815120998045994, + "flos": 24063116420160.0, + "grad_norm": 1.943227024023087, + "language_loss": 0.61630303, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.64291298, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.797487735748291 + }, + { + "auxiliary_loss_clip": 0.01421591, + "auxiliary_loss_mlp": 0.01220115, + "balance_loss_clip": 1.10424876, + "balance_loss_mlp": 1.02642369, + "epoch": 0.5815722230572674, + "flos": 24281774448480.0, + "grad_norm": 1.6281390284755035, + "language_loss": 0.81609344, + "learning_rate": 1.572007019492342e-06, + "loss": 0.84251046, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.827894926071167 + }, + { + "auxiliary_loss_clip": 0.01426443, + "auxiliary_loss_mlp": 0.01238773, + "balance_loss_clip": 1.10947132, + "balance_loss_mlp": 1.04288864, + "epoch": 0.5816323463099353, + "flos": 22202569879200.0, + "grad_norm": 2.4758369298976453, + "language_loss": 0.88101584, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.90766799, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.7464864253997803 + }, + { + "auxiliary_loss_clip": 0.01431028, + "auxiliary_loss_mlp": 0.01234648, + "balance_loss_clip": 1.1137917, + "balance_loss_mlp": 1.03809559, + "epoch": 0.5816924695626033, + "flos": 24136773635520.0, + "grad_norm": 1.4720247842728207, + "language_loss": 0.78826505, + "learning_rate": 1.571246172811984e-06, + "loss": 0.8149218, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.867107391357422 + }, + { + "auxiliary_loss_clip": 0.0142388, + "auxiliary_loss_mlp": 0.01234155, + "balance_loss_clip": 1.10872161, + "balance_loss_mlp": 1.04008222, + "epoch": 0.5817525928152713, + "flos": 21326344783200.0, + "grad_norm": 2.2174448543873178, + "language_loss": 0.70310962, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72968996, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 2.8247058391571045 + }, + { + "auxiliary_loss_clip": 0.01421696, + "auxiliary_loss_mlp": 0.0122461, + "balance_loss_clip": 1.10458434, + "balance_loss_mlp": 1.02986968, + "epoch": 0.5818127160679393, + "flos": 26936468818560.0, + "grad_norm": 2.3040009252289386, + "language_loss": 0.63377249, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.66023558, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 2.8623945713043213 + }, + { + "auxiliary_loss_clip": 0.01450922, + "auxiliary_loss_mlp": 0.01206695, + "balance_loss_clip": 1.16733098, + "balance_loss_mlp": 1.0171051, + "epoch": 0.5818728393206073, + "flos": 63926047981440.0, + "grad_norm": 0.8035516381941715, + "language_loss": 0.5413236, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56789976, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 3.383244752883911 + }, + { + "auxiliary_loss_clip": 0.01449665, + "auxiliary_loss_mlp": 0.01200935, + "balance_loss_clip": 1.16658998, + "balance_loss_mlp": 1.01134491, + "epoch": 0.5819329625732752, + "flos": 64960321541760.0, + "grad_norm": 0.7362966236048, + "language_loss": 0.5609827, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58748865, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 3.1565701961517334 + }, + { + "auxiliary_loss_clip": 0.01418708, + "auxiliary_loss_mlp": 0.01223598, + "balance_loss_clip": 1.10415053, + "balance_loss_mlp": 1.02971601, + "epoch": 0.5819930858259432, + "flos": 21217414014720.0, + "grad_norm": 5.516719197155658, + "language_loss": 0.65761924, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.68404233, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.82963490486145 + }, + { + "auxiliary_loss_clip": 0.0142939, + "auxiliary_loss_mlp": 0.01226237, + "balance_loss_clip": 1.11399508, + "balance_loss_mlp": 1.03073382, + "epoch": 0.5820532090786111, + "flos": 19460260730880.0, + "grad_norm": 2.2984065230709176, + "language_loss": 0.83973169, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.86628795, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 2.7856080532073975 + }, + { + "auxiliary_loss_clip": 0.01426354, + "auxiliary_loss_mlp": 0.01229012, + "balance_loss_clip": 1.11068571, + "balance_loss_mlp": 1.03341341, + "epoch": 0.5821133323312792, + "flos": 17714561751360.0, + "grad_norm": 1.9204945647756695, + "language_loss": 0.7592271, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.78578079, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.775381088256836 + }, + { + "auxiliary_loss_clip": 0.01424943, + "auxiliary_loss_mlp": 0.01236692, + "balance_loss_clip": 1.11034381, + "balance_loss_mlp": 1.03975868, + "epoch": 0.5821734555839471, + "flos": 24574127620320.0, + "grad_norm": 2.1835381538145993, + "language_loss": 0.74962842, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77624476, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.786705255508423 + }, + { + "auxiliary_loss_clip": 0.01430113, + "auxiliary_loss_mlp": 0.01230692, + "balance_loss_clip": 1.11510122, + "balance_loss_mlp": 1.03299522, + "epoch": 0.5822335788366151, + "flos": 22384437228000.0, + "grad_norm": 2.2072306069789986, + "language_loss": 0.74156946, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76817751, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 2.775778293609619 + }, + { + "auxiliary_loss_clip": 0.01426012, + "auxiliary_loss_mlp": 0.01238306, + "balance_loss_clip": 1.11204648, + "balance_loss_mlp": 1.04423356, + "epoch": 0.582293702089283, + "flos": 26724827499840.0, + "grad_norm": 2.2521791205495347, + "language_loss": 0.77973199, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80637515, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.7847704887390137 + }, + { + "auxiliary_loss_clip": 0.01427346, + "auxiliary_loss_mlp": 0.01224556, + "balance_loss_clip": 1.1130873, + "balance_loss_mlp": 1.02781248, + "epoch": 0.582353825341951, + "flos": 17350637412960.0, + "grad_norm": 1.7157575143522017, + "language_loss": 0.75314194, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77966094, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 2.755668878555298 + }, + { + "auxiliary_loss_clip": 0.01453189, + "auxiliary_loss_mlp": 0.01186829, + "balance_loss_clip": 1.16942549, + "balance_loss_mlp": 0.99609375, + "epoch": 0.5824139485946189, + "flos": 55479248807040.0, + "grad_norm": 0.8315615627154036, + "language_loss": 0.57284999, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.5992502, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 3.17472505569458 + }, + { + "auxiliary_loss_clip": 0.01428468, + "auxiliary_loss_mlp": 0.01223768, + "balance_loss_clip": 1.11398113, + "balance_loss_mlp": 1.0282644, + "epoch": 0.582474071847287, + "flos": 20305118874240.0, + "grad_norm": 1.9183703928574323, + "language_loss": 0.69858742, + "learning_rate": 1.566302259738727e-06, + "loss": 0.7251097, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.802630662918091 + }, + { + "auxiliary_loss_clip": 0.01432933, + "auxiliary_loss_mlp": 0.01226569, + "balance_loss_clip": 1.118855, + "balance_loss_mlp": 1.02982605, + "epoch": 0.5825341950999549, + "flos": 23880262939200.0, + "grad_norm": 2.2033774855526227, + "language_loss": 0.65249872, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67909372, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 4.2794623374938965 + }, + { + "auxiliary_loss_clip": 0.01433172, + "auxiliary_loss_mlp": 0.01234435, + "balance_loss_clip": 1.11943197, + "balance_loss_mlp": 1.04112506, + "epoch": 0.5825943183526229, + "flos": 23115320157600.0, + "grad_norm": 1.8602660526232555, + "language_loss": 0.73577881, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.76245487, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.84370756149292 + }, + { + "auxiliary_loss_clip": 0.01429821, + "auxiliary_loss_mlp": 0.01225022, + "balance_loss_clip": 1.11585927, + "balance_loss_mlp": 1.03018653, + "epoch": 0.5826544416052909, + "flos": 22859681808960.0, + "grad_norm": 1.9961647187283225, + "language_loss": 0.75891012, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78545856, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.7721922397613525 + }, + { + "auxiliary_loss_clip": 0.01428902, + "auxiliary_loss_mlp": 0.01224498, + "balance_loss_clip": 1.1148144, + "balance_loss_mlp": 1.02784991, + "epoch": 0.5827145648579588, + "flos": 31504430236320.0, + "grad_norm": 1.7908516695251484, + "language_loss": 0.80826151, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.83479548, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.8697879314422607 + }, + { + "auxiliary_loss_clip": 0.01441904, + "auxiliary_loss_mlp": 0.01191315, + "balance_loss_clip": 1.15857422, + "balance_loss_mlp": 1.00134277, + "epoch": 0.5827746881106268, + "flos": 69818588010720.0, + "grad_norm": 0.7542843454139868, + "language_loss": 0.56932575, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.59565789, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 3.285276174545288 + }, + { + "auxiliary_loss_clip": 0.01427419, + "auxiliary_loss_mlp": 0.01218678, + "balance_loss_clip": 1.11190343, + "balance_loss_mlp": 1.02241182, + "epoch": 0.5828348113632947, + "flos": 23114637450720.0, + "grad_norm": 1.7350168430979025, + "language_loss": 0.79307008, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81953102, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 2.9284403324127197 + }, + { + "auxiliary_loss_clip": 0.01427894, + "auxiliary_loss_mlp": 0.012104, + "balance_loss_clip": 1.11384261, + "balance_loss_mlp": 1.01604128, + "epoch": 0.5828949346159628, + "flos": 21875322435840.0, + "grad_norm": 1.401705707128522, + "language_loss": 0.76358294, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78996587, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.778336524963379 + }, + { + "auxiliary_loss_clip": 0.01443342, + "auxiliary_loss_mlp": 0.01188141, + "balance_loss_clip": 1.16031265, + "balance_loss_mlp": 0.99740601, + "epoch": 0.5829550578686307, + "flos": 65970130710240.0, + "grad_norm": 0.7719288123062055, + "language_loss": 0.54955733, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57587218, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.3521382808685303 + }, + { + "auxiliary_loss_clip": 0.01427885, + "auxiliary_loss_mlp": 0.01227735, + "balance_loss_clip": 1.11382043, + "balance_loss_mlp": 1.03137326, + "epoch": 0.5830151811212987, + "flos": 16291672620480.0, + "grad_norm": 2.295110965429143, + "language_loss": 0.76430941, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.7908656, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 4.198563814163208 + }, + { + "auxiliary_loss_clip": 0.01426117, + "auxiliary_loss_mlp": 0.01221441, + "balance_loss_clip": 1.11163557, + "balance_loss_mlp": 1.0244118, + "epoch": 0.5830753043739666, + "flos": 24171061056480.0, + "grad_norm": 2.6925719075904646, + "language_loss": 0.77721512, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.80369061, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 4.248088121414185 + }, + { + "auxiliary_loss_clip": 0.0143975, + "auxiliary_loss_mlp": 0.01221271, + "balance_loss_clip": 1.12441516, + "balance_loss_mlp": 1.02328825, + "epoch": 0.5831354276266346, + "flos": 27061860408480.0, + "grad_norm": 1.6780648221759897, + "language_loss": 0.83711982, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.86372995, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.872204065322876 + }, + { + "auxiliary_loss_clip": 0.01435596, + "auxiliary_loss_mlp": 0.01226667, + "balance_loss_clip": 1.11940646, + "balance_loss_mlp": 1.02906573, + "epoch": 0.5831955508793025, + "flos": 23625762435360.0, + "grad_norm": 2.642546368504562, + "language_loss": 0.65874112, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68536377, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.826035976409912 + }, + { + "auxiliary_loss_clip": 0.01438016, + "auxiliary_loss_mlp": 0.01224042, + "balance_loss_clip": 1.1231662, + "balance_loss_mlp": 1.02548718, + "epoch": 0.5832556741319705, + "flos": 24975601201440.0, + "grad_norm": 2.2734424012952994, + "language_loss": 0.714643, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.74126363, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.828552007675171 + }, + { + "auxiliary_loss_clip": 0.01429286, + "auxiliary_loss_mlp": 0.01218776, + "balance_loss_clip": 1.1130141, + "balance_loss_mlp": 1.02327311, + "epoch": 0.5833157973846385, + "flos": 23223795788160.0, + "grad_norm": 1.7514023317100234, + "language_loss": 0.85354012, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.88002074, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 2.805591583251953 + }, + { + "auxiliary_loss_clip": 0.01428375, + "auxiliary_loss_mlp": 0.01216626, + "balance_loss_clip": 1.1127038, + "balance_loss_mlp": 1.02303052, + "epoch": 0.5833759206373065, + "flos": 21979929394080.0, + "grad_norm": 1.6895202380834016, + "language_loss": 0.77995133, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80640137, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 2.794857978820801 + }, + { + "auxiliary_loss_clip": 0.01427929, + "auxiliary_loss_mlp": 0.01229307, + "balance_loss_clip": 1.11204529, + "balance_loss_mlp": 1.03408968, + "epoch": 0.5834360438899745, + "flos": 21764571115680.0, + "grad_norm": 4.71953218126018, + "language_loss": 0.71254724, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73911959, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 4.249423027038574 + }, + { + "auxiliary_loss_clip": 0.01422177, + "auxiliary_loss_mlp": 0.01237951, + "balance_loss_clip": 1.10930538, + "balance_loss_mlp": 1.04178047, + "epoch": 0.5834961671426424, + "flos": 15994123290720.0, + "grad_norm": 2.636788586164456, + "language_loss": 0.81955969, + "learning_rate": 1.559841341236335e-06, + "loss": 0.84616101, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.7221341133117676 + }, + { + "auxiliary_loss_clip": 0.01424459, + "auxiliary_loss_mlp": 0.01229505, + "balance_loss_clip": 1.10931408, + "balance_loss_mlp": 1.03400159, + "epoch": 0.5835562903953104, + "flos": 22820425799040.0, + "grad_norm": 1.64664237044076, + "language_loss": 0.80880702, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.83534664, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.8169891834259033 + }, + { + "auxiliary_loss_clip": 0.01428462, + "auxiliary_loss_mlp": 0.01226097, + "balance_loss_clip": 1.11439705, + "balance_loss_mlp": 1.02925837, + "epoch": 0.5836164136479783, + "flos": 48471610308480.0, + "grad_norm": 2.8687710794364696, + "language_loss": 0.75078046, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77732605, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.940000534057617 + }, + { + "auxiliary_loss_clip": 0.01423855, + "auxiliary_loss_mlp": 0.01223642, + "balance_loss_clip": 1.11062169, + "balance_loss_mlp": 1.03052294, + "epoch": 0.5836765369006464, + "flos": 26908401615840.0, + "grad_norm": 1.8286672773843375, + "language_loss": 0.81421548, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.84069049, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 2.811875343322754 + }, + { + "auxiliary_loss_clip": 0.01432222, + "auxiliary_loss_mlp": 0.01233127, + "balance_loss_clip": 1.11919332, + "balance_loss_mlp": 1.03867269, + "epoch": 0.5837366601533143, + "flos": 20086119492480.0, + "grad_norm": 1.7970653928611604, + "language_loss": 0.78485847, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.81151187, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.7587311267852783 + }, + { + "auxiliary_loss_clip": 0.01442547, + "auxiliary_loss_mlp": 0.01214844, + "balance_loss_clip": 1.16122365, + "balance_loss_mlp": 1.02639771, + "epoch": 0.5837967834059823, + "flos": 65370594456000.0, + "grad_norm": 0.7609076802258022, + "language_loss": 0.56586635, + "learning_rate": 1.557941985915844e-06, + "loss": 0.59244025, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.315976858139038 + }, + { + "auxiliary_loss_clip": 0.01430592, + "auxiliary_loss_mlp": 0.01234327, + "balance_loss_clip": 1.11703682, + "balance_loss_mlp": 1.04101682, + "epoch": 0.5838569066586502, + "flos": 25341042666240.0, + "grad_norm": 1.722340890585659, + "language_loss": 0.65728974, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.68393898, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.8499059677124023 + }, + { + "auxiliary_loss_clip": 0.01431741, + "auxiliary_loss_mlp": 0.0123266, + "balance_loss_clip": 1.11828637, + "balance_loss_mlp": 1.03505826, + "epoch": 0.5839170299113182, + "flos": 22231016363520.0, + "grad_norm": 2.336892991158453, + "language_loss": 0.78497767, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.81162167, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.747098207473755 + }, + { + "auxiliary_loss_clip": 0.01425108, + "auxiliary_loss_mlp": 0.01231363, + "balance_loss_clip": 1.11204672, + "balance_loss_mlp": 1.03709948, + "epoch": 0.5839771531639861, + "flos": 22202190597600.0, + "grad_norm": 2.5282966777616496, + "language_loss": 0.73533773, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.76190245, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.8328678607940674 + }, + { + "auxiliary_loss_clip": 0.0143833, + "auxiliary_loss_mlp": 0.01231274, + "balance_loss_clip": 1.12359083, + "balance_loss_mlp": 1.03214693, + "epoch": 0.5840372764166541, + "flos": 22421569260960.0, + "grad_norm": 1.9801520273103264, + "language_loss": 0.69581413, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.72251016, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 2.8799407482147217 + }, + { + "auxiliary_loss_clip": 0.01429023, + "auxiliary_loss_mlp": 0.01229415, + "balance_loss_clip": 1.11462998, + "balance_loss_mlp": 1.03381658, + "epoch": 0.5840973996693221, + "flos": 19830291503040.0, + "grad_norm": 1.938235275810801, + "language_loss": 0.8026886, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82927299, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 2.7686212062835693 + }, + { + "auxiliary_loss_clip": 0.01432091, + "auxiliary_loss_mlp": 0.01238662, + "balance_loss_clip": 1.11848187, + "balance_loss_mlp": 1.04525721, + "epoch": 0.5841575229219901, + "flos": 21145160141280.0, + "grad_norm": 2.873781063647252, + "language_loss": 0.73335445, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.76006198, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 2.772087812423706 + }, + { + "auxiliary_loss_clip": 0.01426984, + "auxiliary_loss_mlp": 0.01222107, + "balance_loss_clip": 1.11448121, + "balance_loss_mlp": 1.02822542, + "epoch": 0.5842176461746581, + "flos": 24642399036960.0, + "grad_norm": 1.6913550409169045, + "language_loss": 0.74739552, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.77388644, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 2.8119208812713623 + }, + { + "auxiliary_loss_clip": 0.01424005, + "auxiliary_loss_mlp": 0.01227538, + "balance_loss_clip": 1.11157203, + "balance_loss_mlp": 1.03441858, + "epoch": 0.584277769427326, + "flos": 19132937431200.0, + "grad_norm": 2.207188228345761, + "language_loss": 0.79993439, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82644975, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 2.8242058753967285 + }, + { + "auxiliary_loss_clip": 0.014309, + "auxiliary_loss_mlp": 0.01232259, + "balance_loss_clip": 1.11834502, + "balance_loss_mlp": 1.0380913, + "epoch": 0.584337892679994, + "flos": 22677359322240.0, + "grad_norm": 1.8870570286340043, + "language_loss": 0.67179877, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.6984303, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 2.8513805866241455 + }, + { + "auxiliary_loss_clip": 0.01431649, + "auxiliary_loss_mlp": 0.01219363, + "balance_loss_clip": 1.11956549, + "balance_loss_mlp": 1.02443194, + "epoch": 0.5843980159326619, + "flos": 31286720412000.0, + "grad_norm": 3.058911676661636, + "language_loss": 0.75775087, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78426093, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 2.9240915775299072 + }, + { + "auxiliary_loss_clip": 0.01428577, + "auxiliary_loss_mlp": 0.01225062, + "balance_loss_clip": 1.11512256, + "balance_loss_mlp": 1.03165698, + "epoch": 0.58445813918533, + "flos": 22750371758880.0, + "grad_norm": 2.5357277304097248, + "language_loss": 0.82992864, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85646504, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.7901570796966553 + }, + { + "auxiliary_loss_clip": 0.01452567, + "auxiliary_loss_mlp": 0.01191009, + "balance_loss_clip": 1.17017698, + "balance_loss_mlp": 1.00141907, + "epoch": 0.5845182624379979, + "flos": 60692602353120.0, + "grad_norm": 0.9278229684298532, + "language_loss": 0.7129842, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73941994, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 3.364351511001587 + }, + { + "auxiliary_loss_clip": 0.01427236, + "auxiliary_loss_mlp": 0.0122537, + "balance_loss_clip": 1.11533237, + "balance_loss_mlp": 1.03148794, + "epoch": 0.5845783856906659, + "flos": 16364305775520.0, + "grad_norm": 2.1976592754166147, + "language_loss": 0.89323115, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91975719, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.759148597717285 + }, + { + "auxiliary_loss_clip": 0.01428887, + "auxiliary_loss_mlp": 0.01227547, + "balance_loss_clip": 1.11770606, + "balance_loss_mlp": 1.03547752, + "epoch": 0.5846385089433338, + "flos": 20085436785600.0, + "grad_norm": 1.4316218948423547, + "language_loss": 0.68376446, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.71032882, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.7907207012176514 + }, + { + "auxiliary_loss_clip": 0.01435987, + "auxiliary_loss_mlp": 0.0123168, + "balance_loss_clip": 1.12294555, + "balance_loss_mlp": 1.03865659, + "epoch": 0.5846986321960018, + "flos": 17312974385760.0, + "grad_norm": 1.738591083661797, + "language_loss": 0.86085165, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88752836, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 2.781850576400757 + }, + { + "auxiliary_loss_clip": 0.01436065, + "auxiliary_loss_mlp": 0.01231976, + "balance_loss_clip": 1.12299228, + "balance_loss_mlp": 1.03990555, + "epoch": 0.5847587554486697, + "flos": 17199606022560.0, + "grad_norm": 1.7606488845707582, + "language_loss": 0.83066177, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85734212, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 2.7898476123809814 + }, + { + "auxiliary_loss_clip": 0.01431914, + "auxiliary_loss_mlp": 0.01225494, + "balance_loss_clip": 1.1194917, + "balance_loss_mlp": 1.03266144, + "epoch": 0.5848188787013378, + "flos": 24531078794400.0, + "grad_norm": 2.172472309157792, + "language_loss": 0.66744673, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69402081, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 4.155052900314331 + }, + { + "auxiliary_loss_clip": 0.01437859, + "auxiliary_loss_mlp": 0.01234646, + "balance_loss_clip": 1.12592208, + "balance_loss_mlp": 1.03809357, + "epoch": 0.5848790019540057, + "flos": 20630318196960.0, + "grad_norm": 1.6598233984965989, + "language_loss": 0.8185935, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84531856, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.842298746109009 + }, + { + "auxiliary_loss_clip": 0.01435073, + "auxiliary_loss_mlp": 0.01219941, + "balance_loss_clip": 1.12341285, + "balance_loss_mlp": 1.02529597, + "epoch": 0.5849391252066737, + "flos": 22421038266720.0, + "grad_norm": 1.8247337842843843, + "language_loss": 0.77865833, + "learning_rate": 1.550728272957027e-06, + "loss": 0.8052085, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.7717015743255615 + }, + { + "auxiliary_loss_clip": 0.01436721, + "auxiliary_loss_mlp": 0.01231123, + "balance_loss_clip": 1.1230092, + "balance_loss_mlp": 1.03533363, + "epoch": 0.5849992484593417, + "flos": 25413296539680.0, + "grad_norm": 2.5202019428995266, + "language_loss": 0.70740139, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.73407984, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.9096503257751465 + }, + { + "auxiliary_loss_clip": 0.01444446, + "auxiliary_loss_mlp": 0.01225002, + "balance_loss_clip": 1.13058078, + "balance_loss_mlp": 1.02492118, + "epoch": 0.5850593717120096, + "flos": 21067065331200.0, + "grad_norm": 1.8935186139230291, + "language_loss": 0.78562844, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.81232291, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.8142335414886475 + }, + { + "auxiliary_loss_clip": 0.01435009, + "auxiliary_loss_mlp": 0.01228229, + "balance_loss_clip": 1.12027943, + "balance_loss_mlp": 1.03100896, + "epoch": 0.5851194949646776, + "flos": 25303910633280.0, + "grad_norm": 2.076910907748307, + "language_loss": 0.7062344, + "learning_rate": 1.549589825316528e-06, + "loss": 0.73286676, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 2.8457887172698975 + }, + { + "auxiliary_loss_clip": 0.01440354, + "auxiliary_loss_mlp": 0.01230767, + "balance_loss_clip": 1.12658381, + "balance_loss_mlp": 1.03030443, + "epoch": 0.5851796182173455, + "flos": 23589540678240.0, + "grad_norm": 2.1571551283798365, + "language_loss": 0.52587104, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.55258226, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 2.772352695465088 + }, + { + "auxiliary_loss_clip": 0.01442695, + "auxiliary_loss_mlp": 0.0122935, + "balance_loss_clip": 1.12777293, + "balance_loss_mlp": 1.03356099, + "epoch": 0.5852397414700136, + "flos": 24824797380000.0, + "grad_norm": 2.3752807726236647, + "language_loss": 0.87339342, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.90011388, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 2.838825225830078 + }, + { + "auxiliary_loss_clip": 0.01436666, + "auxiliary_loss_mlp": 0.01222998, + "balance_loss_clip": 1.12217999, + "balance_loss_mlp": 1.02720916, + "epoch": 0.5852998647226815, + "flos": 19939829122080.0, + "grad_norm": 6.430422334497754, + "language_loss": 0.72372538, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.75032198, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 4.257028341293335 + }, + { + "auxiliary_loss_clip": 0.01438145, + "auxiliary_loss_mlp": 0.01236912, + "balance_loss_clip": 1.12315845, + "balance_loss_mlp": 1.03807068, + "epoch": 0.5853599879753495, + "flos": 16721858183040.0, + "grad_norm": 2.681294961102713, + "language_loss": 0.74140072, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76815128, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 4.298030614852905 + }, + { + "auxiliary_loss_clip": 0.01434492, + "auxiliary_loss_mlp": 0.01224907, + "balance_loss_clip": 1.12042737, + "balance_loss_mlp": 1.0274967, + "epoch": 0.5854201112280174, + "flos": 44460022534560.0, + "grad_norm": 6.456867975888416, + "language_loss": 0.70589423, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.73248827, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 2.967827081680298 + }, + { + "auxiliary_loss_clip": 0.01437359, + "auxiliary_loss_mlp": 0.01222687, + "balance_loss_clip": 1.12461138, + "balance_loss_mlp": 1.02794647, + "epoch": 0.5854802344806854, + "flos": 20341264775040.0, + "grad_norm": 2.746424733674833, + "language_loss": 0.82501066, + "learning_rate": 1.547313391573169e-06, + "loss": 0.85161108, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.8324875831604004 + }, + { + "auxiliary_loss_clip": 0.0143794, + "auxiliary_loss_mlp": 0.0123038, + "balance_loss_clip": 1.12451959, + "balance_loss_mlp": 1.03420925, + "epoch": 0.5855403577333533, + "flos": 20923050650400.0, + "grad_norm": 1.8890279676759787, + "language_loss": 0.68616128, + "learning_rate": 1.546934045946082e-06, + "loss": 0.71284449, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.7599716186523438 + }, + { + "auxiliary_loss_clip": 0.01434195, + "auxiliary_loss_mlp": 0.0121976, + "balance_loss_clip": 1.11894333, + "balance_loss_mlp": 1.02301669, + "epoch": 0.5856004809860214, + "flos": 20450726537760.0, + "grad_norm": 2.840002744065828, + "language_loss": 0.59305239, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.61959195, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.866987466812134 + }, + { + "auxiliary_loss_clip": 0.01429449, + "auxiliary_loss_mlp": 0.01217554, + "balance_loss_clip": 1.11453271, + "balance_loss_mlp": 1.02119255, + "epoch": 0.5856606042386893, + "flos": 19642507361280.0, + "grad_norm": 1.8645172251547373, + "language_loss": 0.75331151, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77978158, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 2.8203656673431396 + }, + { + "auxiliary_loss_clip": 0.0143742, + "auxiliary_loss_mlp": 0.01227351, + "balance_loss_clip": 1.12235594, + "balance_loss_mlp": 1.02984464, + "epoch": 0.5857207274913573, + "flos": 21688145144640.0, + "grad_norm": 1.6170449790099204, + "language_loss": 0.75741506, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.78406274, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 2.795931339263916 + }, + { + "auxiliary_loss_clip": 0.01429696, + "auxiliary_loss_mlp": 0.0122341, + "balance_loss_clip": 1.11520529, + "balance_loss_mlp": 1.02895546, + "epoch": 0.5857808507440253, + "flos": 23184805275360.0, + "grad_norm": 1.6768050217056292, + "language_loss": 0.74919546, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77572656, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 4.330912828445435 + }, + { + "auxiliary_loss_clip": 0.01437237, + "auxiliary_loss_mlp": 0.01225662, + "balance_loss_clip": 1.1225847, + "balance_loss_mlp": 1.03120792, + "epoch": 0.5858409739966932, + "flos": 27237773036160.0, + "grad_norm": 1.8506719712188366, + "language_loss": 0.8115176, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83814657, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.851755142211914 + }, + { + "auxiliary_loss_clip": 0.01432908, + "auxiliary_loss_mlp": 0.01231708, + "balance_loss_clip": 1.11668944, + "balance_loss_mlp": 1.03630018, + "epoch": 0.5859010972493612, + "flos": 27858018430080.0, + "grad_norm": 1.8410936635069206, + "language_loss": 0.71777844, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.74442458, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.855619192123413 + }, + { + "auxiliary_loss_clip": 0.01443207, + "auxiliary_loss_mlp": 0.01195724, + "balance_loss_clip": 1.16019905, + "balance_loss_mlp": 1.00613403, + "epoch": 0.5859612205020291, + "flos": 70014564635040.0, + "grad_norm": 0.7168971460632781, + "language_loss": 0.5321852, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.5585745, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 3.412393093109131 + }, + { + "auxiliary_loss_clip": 0.01429366, + "auxiliary_loss_mlp": 0.01235671, + "balance_loss_clip": 1.11331725, + "balance_loss_mlp": 1.03559065, + "epoch": 0.5860213437546972, + "flos": 24058034046720.0, + "grad_norm": 2.41866852852593, + "language_loss": 0.73583895, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.76248932, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 2.8203532695770264 + }, + { + "auxiliary_loss_clip": 0.01429409, + "auxiliary_loss_mlp": 0.01220493, + "balance_loss_clip": 1.11321867, + "balance_loss_mlp": 1.02231979, + "epoch": 0.5860814670073651, + "flos": 18949287458880.0, + "grad_norm": 1.9217352984626777, + "language_loss": 0.8105157, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83701473, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.933424711227417 + }, + { + "auxiliary_loss_clip": 0.01431515, + "auxiliary_loss_mlp": 0.01230635, + "balance_loss_clip": 1.11553764, + "balance_loss_mlp": 1.02960086, + "epoch": 0.5861415902600331, + "flos": 22563877174560.0, + "grad_norm": 1.7595781407375672, + "language_loss": 0.71741199, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74403346, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.7706058025360107 + }, + { + "auxiliary_loss_clip": 0.01439697, + "auxiliary_loss_mlp": 0.01224102, + "balance_loss_clip": 1.12502742, + "balance_loss_mlp": 1.02630961, + "epoch": 0.586201713512701, + "flos": 14393766477600.0, + "grad_norm": 2.273240902583008, + "language_loss": 0.7496407, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77627861, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.8006033897399902 + }, + { + "auxiliary_loss_clip": 0.0143532, + "auxiliary_loss_mlp": 0.01222915, + "balance_loss_clip": 1.12092996, + "balance_loss_mlp": 1.02502751, + "epoch": 0.586261836765369, + "flos": 19500502872960.0, + "grad_norm": 1.820003046868057, + "language_loss": 0.71017587, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73675823, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.8224291801452637 + }, + { + "auxiliary_loss_clip": 0.01437024, + "auxiliary_loss_mlp": 0.0123394, + "balance_loss_clip": 1.1213218, + "balance_loss_mlp": 1.03328657, + "epoch": 0.5863219600180369, + "flos": 20703785771520.0, + "grad_norm": 1.7750347341875055, + "language_loss": 0.74656653, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.77327621, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.7468276023864746 + }, + { + "auxiliary_loss_clip": 0.0143684, + "auxiliary_loss_mlp": 0.01229623, + "balance_loss_clip": 1.12182856, + "balance_loss_mlp": 1.03440547, + "epoch": 0.586382083270705, + "flos": 19794183530400.0, + "grad_norm": 1.8874401559115808, + "language_loss": 0.77529979, + "learning_rate": 1.541625017642943e-06, + "loss": 0.8019644, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 2.7756001949310303 + }, + { + "auxiliary_loss_clip": 0.01431039, + "auxiliary_loss_mlp": 0.01222169, + "balance_loss_clip": 1.11617291, + "balance_loss_mlp": 1.02685666, + "epoch": 0.5864422065233729, + "flos": 16501986453600.0, + "grad_norm": 1.7821786337958523, + "language_loss": 0.71070385, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.7372359, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 2.7700819969177246 + }, + { + "auxiliary_loss_clip": 0.01436986, + "auxiliary_loss_mlp": 0.01227255, + "balance_loss_clip": 1.12107933, + "balance_loss_mlp": 1.0287956, + "epoch": 0.5865023297760409, + "flos": 20415415056480.0, + "grad_norm": 1.87303762036744, + "language_loss": 0.72074217, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74738461, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 2.8353378772735596 + }, + { + "auxiliary_loss_clip": 0.01450276, + "auxiliary_loss_mlp": 0.01199554, + "balance_loss_clip": 1.16966987, + "balance_loss_mlp": 1.00958252, + "epoch": 0.5865624530287089, + "flos": 63357006332160.0, + "grad_norm": 0.7399796700988827, + "language_loss": 0.56881779, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59531605, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 3.3233649730682373 + }, + { + "auxiliary_loss_clip": 0.01427002, + "auxiliary_loss_mlp": 0.0122873, + "balance_loss_clip": 1.11168349, + "balance_loss_mlp": 1.03532529, + "epoch": 0.5866225762813768, + "flos": 27018735726240.0, + "grad_norm": 2.008258113958927, + "language_loss": 0.76209927, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78865659, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 2.8739964962005615 + }, + { + "auxiliary_loss_clip": 0.01448375, + "auxiliary_loss_mlp": 0.01205841, + "balance_loss_clip": 1.16769803, + "balance_loss_mlp": 1.01815796, + "epoch": 0.5866826995340448, + "flos": 72994041118080.0, + "grad_norm": 0.8585475886270211, + "language_loss": 0.60489231, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.6314345, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 3.249037742614746 + }, + { + "auxiliary_loss_clip": 0.0143653, + "auxiliary_loss_mlp": 0.01235918, + "balance_loss_clip": 1.12013459, + "balance_loss_mlp": 1.03574181, + "epoch": 0.5867428227867127, + "flos": 21287733552000.0, + "grad_norm": 2.2487812755915, + "language_loss": 0.72285521, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.74957967, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.7495319843292236 + }, + { + "auxiliary_loss_clip": 0.01440962, + "auxiliary_loss_mlp": 0.01232195, + "balance_loss_clip": 1.12495375, + "balance_loss_mlp": 1.0348804, + "epoch": 0.5868029460393808, + "flos": 33471366359040.0, + "grad_norm": 1.587667482281253, + "language_loss": 0.73435634, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.76108795, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 2.9292988777160645 + }, + { + "auxiliary_loss_clip": 0.01435022, + "auxiliary_loss_mlp": 0.01229704, + "balance_loss_clip": 1.11891496, + "balance_loss_mlp": 1.03019488, + "epoch": 0.5868630692920487, + "flos": 17891119157760.0, + "grad_norm": 2.0326529461002174, + "language_loss": 0.73163319, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.7582804, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.7179126739501953 + }, + { + "auxiliary_loss_clip": 0.01434464, + "auxiliary_loss_mlp": 0.0123195, + "balance_loss_clip": 1.11761141, + "balance_loss_mlp": 1.03520703, + "epoch": 0.5869231925447167, + "flos": 21037291361280.0, + "grad_norm": 2.126931257432981, + "language_loss": 0.75053018, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77719432, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 2.8423802852630615 + }, + { + "auxiliary_loss_clip": 0.01436911, + "auxiliary_loss_mlp": 0.01235197, + "balance_loss_clip": 1.1214695, + "balance_loss_mlp": 1.03950334, + "epoch": 0.5869833157973846, + "flos": 74743267780800.0, + "grad_norm": 1.3879267397568045, + "language_loss": 0.72255427, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74927533, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 3.2185728549957275 + }, + { + "auxiliary_loss_clip": 0.01427392, + "auxiliary_loss_mlp": 0.01220451, + "balance_loss_clip": 1.11299372, + "balance_loss_mlp": 1.02628326, + "epoch": 0.5870434390500526, + "flos": 17640904536000.0, + "grad_norm": 1.6694261999861726, + "language_loss": 0.79787421, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82435274, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 2.843851089477539 + }, + { + "auxiliary_loss_clip": 0.01434093, + "auxiliary_loss_mlp": 0.01226884, + "balance_loss_clip": 1.11851549, + "balance_loss_mlp": 1.03300178, + "epoch": 0.5871035623027205, + "flos": 21508250060160.0, + "grad_norm": 1.6846005180600125, + "language_loss": 0.78979468, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81640446, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 4.30902624130249 + }, + { + "auxiliary_loss_clip": 0.01430857, + "auxiliary_loss_mlp": 0.01231166, + "balance_loss_clip": 1.11524606, + "balance_loss_mlp": 1.03747487, + "epoch": 0.5871636855553886, + "flos": 13553497641600.0, + "grad_norm": 1.8333987766064064, + "language_loss": 0.83967912, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.86629927, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 2.8078434467315674 + }, + { + "auxiliary_loss_clip": 0.01431595, + "auxiliary_loss_mlp": 0.01238582, + "balance_loss_clip": 1.11567593, + "balance_loss_mlp": 1.04317439, + "epoch": 0.5872238088080565, + "flos": 26215295497920.0, + "grad_norm": 1.73321421159574, + "language_loss": 0.69614875, + "learning_rate": 1.536319396136257e-06, + "loss": 0.72285056, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.8168835639953613 + }, + { + "auxiliary_loss_clip": 0.01428929, + "auxiliary_loss_mlp": 0.01228276, + "balance_loss_clip": 1.1123184, + "balance_loss_mlp": 1.03115201, + "epoch": 0.5872839320607245, + "flos": 30667878360000.0, + "grad_norm": 2.4234821562316986, + "language_loss": 0.63687104, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.66344309, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.8783695697784424 + }, + { + "auxiliary_loss_clip": 0.01445841, + "auxiliary_loss_mlp": 0.01198914, + "balance_loss_clip": 1.16595173, + "balance_loss_mlp": 1.00970459, + "epoch": 0.5873440553133924, + "flos": 60310623846240.0, + "grad_norm": 0.7260824911314916, + "language_loss": 0.53797752, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.56442505, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.305145263671875 + }, + { + "auxiliary_loss_clip": 0.01428566, + "auxiliary_loss_mlp": 0.01225117, + "balance_loss_clip": 1.11128569, + "balance_loss_mlp": 1.02970922, + "epoch": 0.5874041785660604, + "flos": 21541209995520.0, + "grad_norm": 1.4564522420719264, + "language_loss": 0.7056058, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.73214269, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.894627094268799 + }, + { + "auxiliary_loss_clip": 0.01429349, + "auxiliary_loss_mlp": 0.01231422, + "balance_loss_clip": 1.11204076, + "balance_loss_mlp": 1.03734934, + "epoch": 0.5874643018187284, + "flos": 24391236211200.0, + "grad_norm": 2.0423222475315606, + "language_loss": 0.66829944, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69490713, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.8893380165100098 + }, + { + "auxiliary_loss_clip": 0.01429027, + "auxiliary_loss_mlp": 0.01222113, + "balance_loss_clip": 1.11179674, + "balance_loss_mlp": 1.02565598, + "epoch": 0.5875244250713964, + "flos": 28150257817440.0, + "grad_norm": 1.5387546540077777, + "language_loss": 0.66029257, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68680394, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 2.930974245071411 + }, + { + "auxiliary_loss_clip": 0.01439647, + "auxiliary_loss_mlp": 0.01235159, + "balance_loss_clip": 1.12210321, + "balance_loss_mlp": 1.0363183, + "epoch": 0.5875845483240644, + "flos": 25814808048960.0, + "grad_norm": 1.6208334054002196, + "language_loss": 0.74367446, + "learning_rate": 1.534046611017519e-06, + "loss": 0.77042252, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 4.323990106582642 + }, + { + "auxiliary_loss_clip": 0.01434622, + "auxiliary_loss_mlp": 0.01225757, + "balance_loss_clip": 1.11587942, + "balance_loss_mlp": 1.02968144, + "epoch": 0.5876446715767323, + "flos": 26909008466400.0, + "grad_norm": 2.643705374730554, + "language_loss": 0.53423822, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.56084204, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.845682382583618 + }, + { + "auxiliary_loss_clip": 0.0143433, + "auxiliary_loss_mlp": 0.01238563, + "balance_loss_clip": 1.11610103, + "balance_loss_mlp": 1.04258347, + "epoch": 0.5877047948294003, + "flos": 36687820171680.0, + "grad_norm": 2.3665892024351733, + "language_loss": 0.64551413, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.67224312, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 4.439508676528931 + }, + { + "auxiliary_loss_clip": 0.01432896, + "auxiliary_loss_mlp": 0.01232527, + "balance_loss_clip": 1.11443114, + "balance_loss_mlp": 1.03721428, + "epoch": 0.5877649180820682, + "flos": 26727179045760.0, + "grad_norm": 3.416964836181802, + "language_loss": 0.73913217, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.76578641, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.8279566764831543 + }, + { + "auxiliary_loss_clip": 0.01431575, + "auxiliary_loss_mlp": 0.01232204, + "balance_loss_clip": 1.11388123, + "balance_loss_mlp": 1.03965759, + "epoch": 0.5878250413347362, + "flos": 21034295036640.0, + "grad_norm": 1.8629803459343874, + "language_loss": 0.73971784, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76635563, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.7857906818389893 + }, + { + "auxiliary_loss_clip": 0.0142669, + "auxiliary_loss_mlp": 0.01219113, + "balance_loss_clip": 1.10868669, + "balance_loss_mlp": 1.02532625, + "epoch": 0.5878851645874041, + "flos": 25486650329760.0, + "grad_norm": 1.5416027756386201, + "language_loss": 0.74251479, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76897275, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.7806396484375 + }, + { + "auxiliary_loss_clip": 0.01423746, + "auxiliary_loss_mlp": 0.01220954, + "balance_loss_clip": 1.10797417, + "balance_loss_mlp": 1.02878881, + "epoch": 0.5879452878400722, + "flos": 23771408027040.0, + "grad_norm": 1.975588132206969, + "language_loss": 0.70111603, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72756302, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.860398530960083 + }, + { + "auxiliary_loss_clip": 0.01427653, + "auxiliary_loss_mlp": 0.01229149, + "balance_loss_clip": 1.10937285, + "balance_loss_mlp": 1.03393221, + "epoch": 0.5880054110927401, + "flos": 17826602628960.0, + "grad_norm": 2.119792281121957, + "language_loss": 0.67198688, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.69855487, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 4.336871385574341 + }, + { + "auxiliary_loss_clip": 0.01426175, + "auxiliary_loss_mlp": 0.01232204, + "balance_loss_clip": 1.10973191, + "balance_loss_mlp": 1.03698683, + "epoch": 0.5880655343454081, + "flos": 19465418960640.0, + "grad_norm": 2.092469808613364, + "language_loss": 0.72660279, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.75318658, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 2.7536559104919434 + }, + { + "auxiliary_loss_clip": 0.0142899, + "auxiliary_loss_mlp": 0.01225781, + "balance_loss_clip": 1.11300123, + "balance_loss_mlp": 1.03285217, + "epoch": 0.588125657598076, + "flos": 21399736501440.0, + "grad_norm": 1.7527978671012314, + "language_loss": 0.70574844, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.73229617, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 2.8434853553771973 + }, + { + "auxiliary_loss_clip": 0.01425876, + "auxiliary_loss_mlp": 0.01227539, + "balance_loss_clip": 1.1093967, + "balance_loss_mlp": 1.03003275, + "epoch": 0.588185780850744, + "flos": 16036527337920.0, + "grad_norm": 2.52123087306832, + "language_loss": 0.70066381, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.727198, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 2.724911689758301 + }, + { + "auxiliary_loss_clip": 0.01427245, + "auxiliary_loss_mlp": 0.01236479, + "balance_loss_clip": 1.11033344, + "balance_loss_mlp": 1.04040313, + "epoch": 0.588245904103412, + "flos": 23730255609120.0, + "grad_norm": 2.292617288965534, + "language_loss": 0.69549286, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.72213006, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 2.8125317096710205 + }, + { + "auxiliary_loss_clip": 0.01425361, + "auxiliary_loss_mlp": 0.01236351, + "balance_loss_clip": 1.10921693, + "balance_loss_mlp": 1.04056168, + "epoch": 0.58830602735608, + "flos": 33805365014880.0, + "grad_norm": 2.103132769935485, + "language_loss": 0.69693631, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.72355342, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.910951614379883 + }, + { + "auxiliary_loss_clip": 0.01423737, + "auxiliary_loss_mlp": 0.01218247, + "balance_loss_clip": 1.10832644, + "balance_loss_mlp": 1.02159929, + "epoch": 0.588366150608748, + "flos": 17092533733920.0, + "grad_norm": 2.300542412424444, + "language_loss": 0.76995432, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.7963742, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.7837202548980713 + }, + { + "auxiliary_loss_clip": 0.01429295, + "auxiliary_loss_mlp": 0.01229632, + "balance_loss_clip": 1.11342287, + "balance_loss_mlp": 1.03212595, + "epoch": 0.5884262738614159, + "flos": 22129367801760.0, + "grad_norm": 1.7816644310097696, + "language_loss": 0.78995311, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81654239, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 2.8473548889160156 + }, + { + "auxiliary_loss_clip": 0.01430668, + "auxiliary_loss_mlp": 0.01224345, + "balance_loss_clip": 1.11334777, + "balance_loss_mlp": 1.02598071, + "epoch": 0.5884863971140839, + "flos": 21033915755040.0, + "grad_norm": 1.6314819666907523, + "language_loss": 0.66292292, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68947303, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.787899971008301 + }, + { + "auxiliary_loss_clip": 0.01433288, + "auxiliary_loss_mlp": 0.01225807, + "balance_loss_clip": 1.11757016, + "balance_loss_mlp": 1.02934968, + "epoch": 0.5885465203667518, + "flos": 23807364287040.0, + "grad_norm": 2.6534037332232403, + "language_loss": 0.79973221, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82632315, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.8341000080108643 + }, + { + "auxiliary_loss_clip": 0.01433529, + "auxiliary_loss_mlp": 0.01226419, + "balance_loss_clip": 1.11792028, + "balance_loss_mlp": 1.03034401, + "epoch": 0.5886066436194198, + "flos": 18882609024960.0, + "grad_norm": 1.699126872904154, + "language_loss": 0.70690084, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.73350036, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 2.8137571811676025 + }, + { + "auxiliary_loss_clip": 0.0143222, + "auxiliary_loss_mlp": 0.01225248, + "balance_loss_clip": 1.11746788, + "balance_loss_mlp": 1.03136599, + "epoch": 0.5886667668720877, + "flos": 24792444295200.0, + "grad_norm": 1.9446971480671222, + "language_loss": 0.83155781, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85813248, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.9175946712493896 + }, + { + "auxiliary_loss_clip": 0.0143648, + "auxiliary_loss_mlp": 0.01230189, + "balance_loss_clip": 1.12064147, + "balance_loss_mlp": 1.03621173, + "epoch": 0.5887268901247558, + "flos": 21616422265440.0, + "grad_norm": 2.918056584081395, + "language_loss": 0.76445746, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.79112411, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.833798408508301 + }, + { + "auxiliary_loss_clip": 0.01430324, + "auxiliary_loss_mlp": 0.01223901, + "balance_loss_clip": 1.11370587, + "balance_loss_mlp": 1.02753985, + "epoch": 0.5887870133774237, + "flos": 20483610616800.0, + "grad_norm": 2.347976289790023, + "language_loss": 0.69366324, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.72020555, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 2.803978443145752 + }, + { + "auxiliary_loss_clip": 0.01427749, + "auxiliary_loss_mlp": 0.01223973, + "balance_loss_clip": 1.11290276, + "balance_loss_mlp": 1.02894711, + "epoch": 0.5888471366300917, + "flos": 19208187629280.0, + "grad_norm": 1.8056105109583458, + "language_loss": 0.60469449, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.63121176, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 2.806975841522217 + }, + { + "auxiliary_loss_clip": 0.01432352, + "auxiliary_loss_mlp": 0.01233568, + "balance_loss_clip": 1.11591983, + "balance_loss_mlp": 1.03978157, + "epoch": 0.5889072598827596, + "flos": 19974875106240.0, + "grad_norm": 2.0025175781563154, + "language_loss": 0.64903665, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67569584, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.76275372505188 + }, + { + "auxiliary_loss_clip": 0.01437173, + "auxiliary_loss_mlp": 0.01231093, + "balance_loss_clip": 1.12274837, + "balance_loss_mlp": 1.03625703, + "epoch": 0.5889673831354276, + "flos": 20743383134880.0, + "grad_norm": 1.8147086248126447, + "language_loss": 0.74122387, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76790649, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 2.819520950317383 + }, + { + "auxiliary_loss_clip": 0.01431829, + "auxiliary_loss_mlp": 0.01227583, + "balance_loss_clip": 1.11717153, + "balance_loss_mlp": 1.03532195, + "epoch": 0.5890275063880956, + "flos": 25303265854560.0, + "grad_norm": 1.909438588383459, + "language_loss": 0.83015794, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.8567521, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 2.835556745529175 + }, + { + "auxiliary_loss_clip": 0.01429112, + "auxiliary_loss_mlp": 0.01224802, + "balance_loss_clip": 1.11511779, + "balance_loss_mlp": 1.03120613, + "epoch": 0.5890876296407636, + "flos": 11766987597600.0, + "grad_norm": 4.173989924279925, + "language_loss": 0.79451454, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.82105368, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 2.775033950805664 + }, + { + "auxiliary_loss_clip": 0.01430317, + "auxiliary_loss_mlp": 0.01226948, + "balance_loss_clip": 1.11514175, + "balance_loss_mlp": 1.03420997, + "epoch": 0.5891477528934316, + "flos": 13591160668800.0, + "grad_norm": 2.091876450641637, + "language_loss": 0.73727006, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.7638427, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 2.752230405807495 + }, + { + "auxiliary_loss_clip": 0.01427679, + "auxiliary_loss_mlp": 0.01235541, + "balance_loss_clip": 1.11354768, + "balance_loss_mlp": 1.04061007, + "epoch": 0.5892078761460995, + "flos": 15050992191840.0, + "grad_norm": 2.2186568671912252, + "language_loss": 0.76472378, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.79135597, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.93966007232666 + }, + { + "auxiliary_loss_clip": 0.01426912, + "auxiliary_loss_mlp": 0.01224788, + "balance_loss_clip": 1.11267209, + "balance_loss_mlp": 1.03128803, + "epoch": 0.5892679993987675, + "flos": 15780016641600.0, + "grad_norm": 2.103479429274868, + "language_loss": 0.78665972, + "learning_rate": 1.523448741022722e-06, + "loss": 0.81317675, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.834752082824707 + }, + { + "auxiliary_loss_clip": 0.01422307, + "auxiliary_loss_mlp": 0.01224841, + "balance_loss_clip": 1.10801291, + "balance_loss_mlp": 1.02981436, + "epoch": 0.5893281226514354, + "flos": 25267575091680.0, + "grad_norm": 1.6639037903028386, + "language_loss": 0.65979713, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68626863, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.9065563678741455 + }, + { + "auxiliary_loss_clip": 0.01431848, + "auxiliary_loss_mlp": 0.01221262, + "balance_loss_clip": 1.11865115, + "balance_loss_mlp": 1.02575922, + "epoch": 0.5893882459041034, + "flos": 19459957305600.0, + "grad_norm": 1.8487599498831173, + "language_loss": 0.78199494, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80852604, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 4.130795478820801 + }, + { + "auxiliary_loss_clip": 0.01425231, + "auxiliary_loss_mlp": 0.01228281, + "balance_loss_clip": 1.1112442, + "balance_loss_mlp": 1.03373182, + "epoch": 0.5894483691567713, + "flos": 20636310846240.0, + "grad_norm": 2.043850109083224, + "language_loss": 0.72866791, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75520301, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.790807008743286 + }, + { + "auxiliary_loss_clip": 0.0142333, + "auxiliary_loss_mlp": 0.01228319, + "balance_loss_clip": 1.1098218, + "balance_loss_mlp": 1.03405547, + "epoch": 0.5895084924094394, + "flos": 17779760987040.0, + "grad_norm": 1.6770783513439917, + "language_loss": 0.74777091, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.7742874, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.7775020599365234 + }, + { + "auxiliary_loss_clip": 0.01424386, + "auxiliary_loss_mlp": 0.01229758, + "balance_loss_clip": 1.10957503, + "balance_loss_mlp": 1.03606665, + "epoch": 0.5895686156621073, + "flos": 20123175669120.0, + "grad_norm": 1.7775832576667965, + "language_loss": 0.78097761, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.80751902, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 2.801231861114502 + }, + { + "auxiliary_loss_clip": 0.01428358, + "auxiliary_loss_mlp": 0.01230531, + "balance_loss_clip": 1.11465871, + "balance_loss_mlp": 1.0326438, + "epoch": 0.5896287389147753, + "flos": 20852162190720.0, + "grad_norm": 1.9487034426086645, + "language_loss": 0.7705549, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79714376, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.764655113220215 + }, + { + "auxiliary_loss_clip": 0.01430884, + "auxiliary_loss_mlp": 0.01226571, + "balance_loss_clip": 1.1170578, + "balance_loss_mlp": 1.03078163, + "epoch": 0.5896888621674432, + "flos": 14539639638240.0, + "grad_norm": 2.2172861667274106, + "language_loss": 0.74273551, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76931, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.774775266647339 + }, + { + "auxiliary_loss_clip": 0.01427407, + "auxiliary_loss_mlp": 0.01221666, + "balance_loss_clip": 1.11474109, + "balance_loss_mlp": 1.0230155, + "epoch": 0.5897489854201112, + "flos": 20888877013920.0, + "grad_norm": 2.0448186330675053, + "language_loss": 0.72299659, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.74948728, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.781423568725586 + }, + { + "auxiliary_loss_clip": 0.01424977, + "auxiliary_loss_mlp": 0.0122105, + "balance_loss_clip": 1.11170185, + "balance_loss_mlp": 1.02459335, + "epoch": 0.5898091086727792, + "flos": 20013220840320.0, + "grad_norm": 2.004720957574905, + "language_loss": 0.82067561, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.8471359, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 2.7807602882385254 + }, + { + "auxiliary_loss_clip": 0.01430087, + "auxiliary_loss_mlp": 0.01225232, + "balance_loss_clip": 1.11669457, + "balance_loss_mlp": 1.02963352, + "epoch": 0.5898692319254472, + "flos": 16255109509920.0, + "grad_norm": 1.8705137815854664, + "language_loss": 0.81091368, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83746684, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 2.78729248046875 + }, + { + "auxiliary_loss_clip": 0.01429509, + "auxiliary_loss_mlp": 0.01229164, + "balance_loss_clip": 1.11835039, + "balance_loss_mlp": 1.03070402, + "epoch": 0.5899293551781152, + "flos": 20450612753280.0, + "grad_norm": 2.347159055324263, + "language_loss": 0.76672393, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.79331064, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 4.263381481170654 + }, + { + "auxiliary_loss_clip": 0.01423218, + "auxiliary_loss_mlp": 0.0122774, + "balance_loss_clip": 1.1131742, + "balance_loss_mlp": 1.03528881, + "epoch": 0.5899894784307831, + "flos": 13883437984320.0, + "grad_norm": 1.766884181539932, + "language_loss": 0.70764601, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.73415565, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 4.321744441986084 + }, + { + "auxiliary_loss_clip": 0.01425138, + "auxiliary_loss_mlp": 0.01224785, + "balance_loss_clip": 1.11451197, + "balance_loss_mlp": 1.0313797, + "epoch": 0.5900496016834511, + "flos": 20085853995360.0, + "grad_norm": 1.6966922346811761, + "language_loss": 0.72533774, + "learning_rate": 1.518533098148494e-06, + "loss": 0.7518369, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 2.84450101852417 + }, + { + "auxiliary_loss_clip": 0.01430798, + "auxiliary_loss_mlp": 0.01223039, + "balance_loss_clip": 1.12185848, + "balance_loss_mlp": 1.02543819, + "epoch": 0.590109724936119, + "flos": 20260818419040.0, + "grad_norm": 1.8732083554977843, + "language_loss": 0.78974175, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.81628013, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.7443525791168213 + }, + { + "auxiliary_loss_clip": 0.01432556, + "auxiliary_loss_mlp": 0.01230027, + "balance_loss_clip": 1.12149143, + "balance_loss_mlp": 1.03252101, + "epoch": 0.590169848188787, + "flos": 24236905070880.0, + "grad_norm": 2.7670675406814333, + "language_loss": 0.7603122, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.78693807, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.772724151611328 + }, + { + "auxiliary_loss_clip": 0.01438944, + "auxiliary_loss_mlp": 0.01228626, + "balance_loss_clip": 1.12967551, + "balance_loss_mlp": 1.03150141, + "epoch": 0.590229971441455, + "flos": 17786625984000.0, + "grad_norm": 1.9785961505182361, + "language_loss": 0.81143177, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83810747, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.7708375453948975 + }, + { + "auxiliary_loss_clip": 0.01432357, + "auxiliary_loss_mlp": 0.01226372, + "balance_loss_clip": 1.12148964, + "balance_loss_mlp": 1.03458762, + "epoch": 0.590290094694123, + "flos": 22239095061600.0, + "grad_norm": 1.6904534330258423, + "language_loss": 0.7683537, + "learning_rate": 1.517021211933682e-06, + "loss": 0.79494101, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 4.268857955932617 + }, + { + "auxiliary_loss_clip": 0.01433602, + "auxiliary_loss_mlp": 0.01223845, + "balance_loss_clip": 1.12432575, + "balance_loss_mlp": 1.02853251, + "epoch": 0.5903502179467909, + "flos": 19100622274560.0, + "grad_norm": 3.6557102935903623, + "language_loss": 0.66658044, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.69315487, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.7704572677612305 + }, + { + "auxiliary_loss_clip": 0.01439487, + "auxiliary_loss_mlp": 0.01227623, + "balance_loss_clip": 1.12958217, + "balance_loss_mlp": 1.03307319, + "epoch": 0.5904103411994589, + "flos": 24237360208800.0, + "grad_norm": 1.5939157571628793, + "language_loss": 0.77911365, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.8057847, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 2.7692337036132812 + }, + { + "auxiliary_loss_clip": 0.01467121, + "auxiliary_loss_mlp": 0.01194893, + "balance_loss_clip": 1.1883235, + "balance_loss_mlp": 1.00492096, + "epoch": 0.5904704644521268, + "flos": 64882568085120.0, + "grad_norm": 0.9359902661948237, + "language_loss": 0.65037751, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67699766, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 3.2883687019348145 + }, + { + "auxiliary_loss_clip": 0.01431766, + "auxiliary_loss_mlp": 0.01214477, + "balance_loss_clip": 1.12270391, + "balance_loss_mlp": 1.01678014, + "epoch": 0.5905305877047948, + "flos": 19612050684480.0, + "grad_norm": 1.9193895045802656, + "language_loss": 0.62116593, + "learning_rate": 1.515509618752521e-06, + "loss": 0.64762837, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 2.728541374206543 + }, + { + "auxiliary_loss_clip": 0.01435745, + "auxiliary_loss_mlp": 0.01221346, + "balance_loss_clip": 1.1260767, + "balance_loss_mlp": 1.02355433, + "epoch": 0.5905907109574628, + "flos": 18991653577920.0, + "grad_norm": 1.9006943789685689, + "language_loss": 0.82739127, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.85396218, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.78731632232666 + }, + { + "auxiliary_loss_clip": 0.01442313, + "auxiliary_loss_mlp": 0.01222998, + "balance_loss_clip": 1.1327728, + "balance_loss_mlp": 1.02463412, + "epoch": 0.5906508342101308, + "flos": 22202607807360.0, + "grad_norm": 2.248086470417332, + "language_loss": 0.73107362, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75772673, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.8564376831054688 + }, + { + "auxiliary_loss_clip": 0.01436659, + "auxiliary_loss_mlp": 0.01237041, + "balance_loss_clip": 1.12697005, + "balance_loss_mlp": 1.04134715, + "epoch": 0.5907109574627988, + "flos": 20888801157600.0, + "grad_norm": 2.3238877725852802, + "language_loss": 0.82959628, + "learning_rate": 1.514376116721693e-06, + "loss": 0.85633332, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 2.752592086791992 + }, + { + "auxiliary_loss_clip": 0.01437247, + "auxiliary_loss_mlp": 0.01216582, + "balance_loss_clip": 1.12887263, + "balance_loss_mlp": 1.02289081, + "epoch": 0.5907710807154667, + "flos": 21508781054400.0, + "grad_norm": 1.656326383458591, + "language_loss": 0.76621628, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79275453, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 2.772550106048584 + }, + { + "auxiliary_loss_clip": 0.01437816, + "auxiliary_loss_mlp": 0.012226, + "balance_loss_clip": 1.12810874, + "balance_loss_mlp": 1.02662015, + "epoch": 0.5908312039681347, + "flos": 22020588745920.0, + "grad_norm": 1.9317102561785797, + "language_loss": 0.72311318, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74971735, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.8093810081481934 + }, + { + "auxiliary_loss_clip": 0.01435782, + "auxiliary_loss_mlp": 0.01224858, + "balance_loss_clip": 1.12685072, + "balance_loss_mlp": 1.02887821, + "epoch": 0.5908913272208026, + "flos": 18481552653600.0, + "grad_norm": 1.7959871292343437, + "language_loss": 0.79805779, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.82466418, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 2.781558036804199 + }, + { + "auxiliary_loss_clip": 0.01444039, + "auxiliary_loss_mlp": 0.01234436, + "balance_loss_clip": 1.13457441, + "balance_loss_mlp": 1.03731179, + "epoch": 0.5909514504734706, + "flos": 12313803345120.0, + "grad_norm": 4.640771540049675, + "language_loss": 0.88540173, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.9121865, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 2.7604002952575684 + }, + { + "auxiliary_loss_clip": 0.01475252, + "auxiliary_loss_mlp": 0.01182526, + "balance_loss_clip": 1.19501257, + "balance_loss_mlp": 0.995224, + "epoch": 0.5910115737261386, + "flos": 70220061591840.0, + "grad_norm": 0.7570613938277977, + "language_loss": 0.57806504, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.60464287, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 3.224943161010742 + }, + { + "auxiliary_loss_clip": 0.01447772, + "auxiliary_loss_mlp": 0.01250348, + "balance_loss_clip": 1.13926911, + "balance_loss_mlp": 1.0513165, + "epoch": 0.5910716969788066, + "flos": 22019943967200.0, + "grad_norm": 2.462184640046111, + "language_loss": 0.75725079, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78423202, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 2.82016658782959 + }, + { + "auxiliary_loss_clip": 0.01438599, + "auxiliary_loss_mlp": 0.01227639, + "balance_loss_clip": 1.13005328, + "balance_loss_mlp": 1.03490186, + "epoch": 0.5911318202314745, + "flos": 21253787484480.0, + "grad_norm": 1.7383527408850912, + "language_loss": 0.77939868, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80606103, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.7586987018585205 + }, + { + "auxiliary_loss_clip": 0.014355, + "auxiliary_loss_mlp": 0.0122741, + "balance_loss_clip": 1.12668467, + "balance_loss_mlp": 1.03247905, + "epoch": 0.5911919434841425, + "flos": 17823530448000.0, + "grad_norm": 1.8214507377304763, + "language_loss": 0.83394033, + "learning_rate": 1.511354255945847e-06, + "loss": 0.86056942, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.766981363296509 + }, + { + "auxiliary_loss_clip": 0.01436522, + "auxiliary_loss_mlp": 0.01231399, + "balance_loss_clip": 1.12735248, + "balance_loss_mlp": 1.03780341, + "epoch": 0.5912520667368104, + "flos": 20376803825280.0, + "grad_norm": 1.597376538558956, + "language_loss": 0.74176991, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76844919, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.7620720863342285 + }, + { + "auxiliary_loss_clip": 0.01440974, + "auxiliary_loss_mlp": 0.01234319, + "balance_loss_clip": 1.13226211, + "balance_loss_mlp": 1.04034126, + "epoch": 0.5913121899894784, + "flos": 17932461216480.0, + "grad_norm": 2.5114162706652468, + "language_loss": 0.7809875, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80774039, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 2.8577358722686768 + }, + { + "auxiliary_loss_clip": 0.01439426, + "auxiliary_loss_mlp": 0.01227859, + "balance_loss_clip": 1.13009, + "balance_loss_mlp": 1.03245139, + "epoch": 0.5913723132421465, + "flos": 22129102304640.0, + "grad_norm": 2.916228577501902, + "language_loss": 0.73411739, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76079023, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 2.8274009227752686 + }, + { + "auxiliary_loss_clip": 0.01441295, + "auxiliary_loss_mlp": 0.01235686, + "balance_loss_clip": 1.1321435, + "balance_loss_mlp": 1.04209018, + "epoch": 0.5914324364948144, + "flos": 15699304788480.0, + "grad_norm": 1.990654097890263, + "language_loss": 0.81945992, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84622967, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 2.7465357780456543 + }, + { + "auxiliary_loss_clip": 0.01435374, + "auxiliary_loss_mlp": 0.01220101, + "balance_loss_clip": 1.12531579, + "balance_loss_mlp": 1.02555156, + "epoch": 0.5914925597474824, + "flos": 22749537339360.0, + "grad_norm": 1.8932409931514182, + "language_loss": 0.79510891, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.82166374, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 2.754946231842041 + }, + { + "auxiliary_loss_clip": 0.01442213, + "auxiliary_loss_mlp": 0.0123756, + "balance_loss_clip": 1.13409448, + "balance_loss_mlp": 1.04234278, + "epoch": 0.5915526830001503, + "flos": 18294565003200.0, + "grad_norm": 2.350534696576548, + "language_loss": 0.6939187, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.72071648, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.812476634979248 + }, + { + "auxiliary_loss_clip": 0.01437811, + "auxiliary_loss_mlp": 0.01227646, + "balance_loss_clip": 1.12979054, + "balance_loss_mlp": 1.03385901, + "epoch": 0.5916128062528183, + "flos": 17020962567360.0, + "grad_norm": 1.8475991426058613, + "language_loss": 0.65726435, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68391895, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.850801944732666 + }, + { + "auxiliary_loss_clip": 0.0143868, + "auxiliary_loss_mlp": 0.0123058, + "balance_loss_clip": 1.12808633, + "balance_loss_mlp": 1.03622174, + "epoch": 0.5916729295054862, + "flos": 24756563891520.0, + "grad_norm": 1.9122903503935076, + "language_loss": 0.81387895, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.84057152, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 4.149972677230835 + }, + { + "auxiliary_loss_clip": 0.01436475, + "auxiliary_loss_mlp": 0.01226754, + "balance_loss_clip": 1.12687743, + "balance_loss_mlp": 1.03315854, + "epoch": 0.5917330527581542, + "flos": 15959873797920.0, + "grad_norm": 1.7440547658275123, + "language_loss": 0.69277024, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71940255, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.807429075241089 + }, + { + "auxiliary_loss_clip": 0.01439361, + "auxiliary_loss_mlp": 0.0122389, + "balance_loss_clip": 1.13122511, + "balance_loss_mlp": 1.02638435, + "epoch": 0.5917931760108222, + "flos": 23802737051520.0, + "grad_norm": 2.116375963023135, + "language_loss": 0.83149481, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.85812736, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.8224029541015625 + }, + { + "auxiliary_loss_clip": 0.01442238, + "auxiliary_loss_mlp": 0.01218368, + "balance_loss_clip": 1.13287711, + "balance_loss_mlp": 1.02124333, + "epoch": 0.5918532992634902, + "flos": 23251369924800.0, + "grad_norm": 2.720620028077892, + "language_loss": 0.8186636, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84526968, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.7352488040924072 + }, + { + "auxiliary_loss_clip": 0.01446604, + "auxiliary_loss_mlp": 0.01242023, + "balance_loss_clip": 1.13707876, + "balance_loss_mlp": 1.04823613, + "epoch": 0.5919134225161581, + "flos": 19501868286720.0, + "grad_norm": 2.091387072540141, + "language_loss": 0.74740386, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.77429014, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.88869309425354 + }, + { + "auxiliary_loss_clip": 0.0144392, + "auxiliary_loss_mlp": 0.01220179, + "balance_loss_clip": 1.1340239, + "balance_loss_mlp": 1.02362669, + "epoch": 0.5919735457688261, + "flos": 38804042989440.0, + "grad_norm": 1.7563003284158667, + "language_loss": 0.63537717, + "learning_rate": 1.506446264718213e-06, + "loss": 0.66201818, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 3.018388032913208 + }, + { + "auxiliary_loss_clip": 0.01444678, + "auxiliary_loss_mlp": 0.01215646, + "balance_loss_clip": 1.13652229, + "balance_loss_mlp": 1.01937938, + "epoch": 0.592033669021494, + "flos": 22166196409440.0, + "grad_norm": 2.052287582786027, + "language_loss": 0.76316357, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78976673, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.830575466156006 + }, + { + "auxiliary_loss_clip": 0.01446617, + "auxiliary_loss_mlp": 0.01214928, + "balance_loss_clip": 1.1364392, + "balance_loss_mlp": 1.01723146, + "epoch": 0.592093792274162, + "flos": 22713201797760.0, + "grad_norm": 1.8187130247420507, + "language_loss": 0.61905754, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64567304, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.8586573600769043 + }, + { + "auxiliary_loss_clip": 0.01446644, + "auxiliary_loss_mlp": 0.01231552, + "balance_loss_clip": 1.13715243, + "balance_loss_mlp": 1.03786111, + "epoch": 0.59215391552683, + "flos": 22531182736320.0, + "grad_norm": 1.8965442461954805, + "language_loss": 0.75902081, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.78580272, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.845189094543457 + }, + { + "auxiliary_loss_clip": 0.0144929, + "auxiliary_loss_mlp": 0.01229995, + "balance_loss_clip": 1.13992381, + "balance_loss_mlp": 1.03363347, + "epoch": 0.592214038779498, + "flos": 24501342752640.0, + "grad_norm": 1.9272906487254893, + "language_loss": 0.7553553, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.78214812, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 4.245331048965454 + }, + { + "auxiliary_loss_clip": 0.01446988, + "auxiliary_loss_mlp": 0.01228357, + "balance_loss_clip": 1.13782501, + "balance_loss_mlp": 1.0336163, + "epoch": 0.592274162032166, + "flos": 21833297670240.0, + "grad_norm": 2.9791398050547295, + "language_loss": 0.75890267, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.78565609, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 4.335484027862549 + }, + { + "auxiliary_loss_clip": 0.01445314, + "auxiliary_loss_mlp": 0.01225494, + "balance_loss_clip": 1.13514149, + "balance_loss_mlp": 1.03227997, + "epoch": 0.5923342852848339, + "flos": 24610463161920.0, + "grad_norm": 2.0375983753998246, + "language_loss": 0.70725644, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.7339645, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.8584940433502197 + }, + { + "auxiliary_loss_clip": 0.01449323, + "auxiliary_loss_mlp": 0.01229865, + "balance_loss_clip": 1.13849974, + "balance_loss_mlp": 1.03426647, + "epoch": 0.5923944085375019, + "flos": 19940208403680.0, + "grad_norm": 6.840408998748416, + "language_loss": 0.80458498, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.83137679, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 2.8009889125823975 + }, + { + "auxiliary_loss_clip": 0.01449225, + "auxiliary_loss_mlp": 0.01225785, + "balance_loss_clip": 1.13955498, + "balance_loss_mlp": 1.02990031, + "epoch": 0.5924545317901698, + "flos": 28661079376800.0, + "grad_norm": 2.2824973660041947, + "language_loss": 0.67799759, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.70474768, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.8477416038513184 + }, + { + "auxiliary_loss_clip": 0.01445396, + "auxiliary_loss_mlp": 0.0122062, + "balance_loss_clip": 1.13595462, + "balance_loss_mlp": 1.0269289, + "epoch": 0.5925146550428378, + "flos": 19866930469920.0, + "grad_norm": 3.2690968988565734, + "language_loss": 0.89141655, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.91807663, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.833331823348999 + }, + { + "auxiliary_loss_clip": 0.0145113, + "auxiliary_loss_mlp": 0.0122123, + "balance_loss_clip": 1.14164019, + "balance_loss_mlp": 1.02763367, + "epoch": 0.5925747782955058, + "flos": 15124839048000.0, + "grad_norm": 2.1598600012551685, + "language_loss": 0.86924648, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.89597011, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 4.313991069793701 + }, + { + "auxiliary_loss_clip": 0.01446311, + "auxiliary_loss_mlp": 0.01234842, + "balance_loss_clip": 1.1368432, + "balance_loss_mlp": 1.04010201, + "epoch": 0.5926349015481738, + "flos": 18407402372160.0, + "grad_norm": 7.282960400130517, + "language_loss": 0.77626896, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.80308056, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 2.725945234298706 + }, + { + "auxiliary_loss_clip": 0.01453622, + "auxiliary_loss_mlp": 0.01231902, + "balance_loss_clip": 1.14391673, + "balance_loss_mlp": 1.0365901, + "epoch": 0.5926950248008417, + "flos": 23113309965120.0, + "grad_norm": 2.2213869211303248, + "language_loss": 0.6472249, + "learning_rate": 1.501918617901419e-06, + "loss": 0.67408013, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 2.778988838195801 + }, + { + "auxiliary_loss_clip": 0.01450444, + "auxiliary_loss_mlp": 0.01218719, + "balance_loss_clip": 1.14135575, + "balance_loss_mlp": 1.02455068, + "epoch": 0.5927551480535097, + "flos": 28036206747360.0, + "grad_norm": 1.855629041797922, + "language_loss": 0.76789397, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79458559, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 2.8332014083862305 + }, + { + "auxiliary_loss_clip": 0.01455902, + "auxiliary_loss_mlp": 0.01223409, + "balance_loss_clip": 1.14579916, + "balance_loss_mlp": 1.02571261, + "epoch": 0.5928152713061776, + "flos": 21800830800960.0, + "grad_norm": 2.296173867217472, + "language_loss": 0.75399339, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.78078651, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.7787985801696777 + }, + { + "auxiliary_loss_clip": 0.01450332, + "auxiliary_loss_mlp": 0.01227857, + "balance_loss_clip": 1.14159083, + "balance_loss_mlp": 1.03454709, + "epoch": 0.5928753945588456, + "flos": 24319171978560.0, + "grad_norm": 1.7281469866842156, + "language_loss": 0.75992185, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78670371, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 2.906928539276123 + }, + { + "auxiliary_loss_clip": 0.01445158, + "auxiliary_loss_mlp": 0.01220053, + "balance_loss_clip": 1.13595009, + "balance_loss_mlp": 1.02626657, + "epoch": 0.5929355178115137, + "flos": 26466799677120.0, + "grad_norm": 2.296514502486044, + "language_loss": 0.70877731, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.73542941, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 2.8182878494262695 + }, + { + "auxiliary_loss_clip": 0.01447114, + "auxiliary_loss_mlp": 0.01221554, + "balance_loss_clip": 1.13737059, + "balance_loss_mlp": 1.0282445, + "epoch": 0.5929956410641816, + "flos": 24967939713120.0, + "grad_norm": 63.23263283844726, + "language_loss": 0.77878559, + "learning_rate": 1.500032899685832e-06, + "loss": 0.80547231, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.845289707183838 + }, + { + "auxiliary_loss_clip": 0.01456869, + "auxiliary_loss_mlp": 0.01230177, + "balance_loss_clip": 1.14795136, + "balance_loss_mlp": 1.03228951, + "epoch": 0.5930557643168496, + "flos": 26210440693440.0, + "grad_norm": 1.7221175222954366, + "language_loss": 0.70800328, + "learning_rate": 1.499655812861921e-06, + "loss": 0.73487377, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.819307565689087 + }, + { + "auxiliary_loss_clip": 0.01450873, + "auxiliary_loss_mlp": 0.01228179, + "balance_loss_clip": 1.1432966, + "balance_loss_mlp": 1.0339154, + "epoch": 0.5931158875695175, + "flos": 27857411579520.0, + "grad_norm": 1.5304470512878743, + "language_loss": 0.67423946, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.70103002, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 2.8518872261047363 + }, + { + "auxiliary_loss_clip": 0.01451809, + "auxiliary_loss_mlp": 0.0123207, + "balance_loss_clip": 1.14212191, + "balance_loss_mlp": 1.03704357, + "epoch": 0.5931760108221855, + "flos": 15415447524480.0, + "grad_norm": 2.9220942081585384, + "language_loss": 0.77851713, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80535591, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 2.758070945739746 + }, + { + "auxiliary_loss_clip": 0.01462907, + "auxiliary_loss_mlp": 0.01225865, + "balance_loss_clip": 1.15558779, + "balance_loss_mlp": 1.02788234, + "epoch": 0.5932361340748534, + "flos": 30191040796320.0, + "grad_norm": 1.8480513414186281, + "language_loss": 0.72367197, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.75055969, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.8867077827453613 + }, + { + "auxiliary_loss_clip": 0.01462212, + "auxiliary_loss_mlp": 0.0122473, + "balance_loss_clip": 1.15372407, + "balance_loss_mlp": 1.03084755, + "epoch": 0.5932962573275214, + "flos": 20159662923360.0, + "grad_norm": 2.206660728925822, + "language_loss": 0.66497028, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.69183969, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 2.8881959915161133 + }, + { + "auxiliary_loss_clip": 0.01454877, + "auxiliary_loss_mlp": 0.01226581, + "balance_loss_clip": 1.14553332, + "balance_loss_mlp": 1.03155422, + "epoch": 0.5933563805801894, + "flos": 25448190811200.0, + "grad_norm": 1.6619487025962438, + "language_loss": 0.7535001, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.78031462, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 2.797882318496704 + }, + { + "auxiliary_loss_clip": 0.01456359, + "auxiliary_loss_mlp": 0.01220802, + "balance_loss_clip": 1.14722455, + "balance_loss_mlp": 1.02329636, + "epoch": 0.5934165038328574, + "flos": 59999837952960.0, + "grad_norm": 1.5793082430708096, + "language_loss": 0.73978502, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76655662, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 3.097993850708008 + }, + { + "auxiliary_loss_clip": 0.01452005, + "auxiliary_loss_mlp": 0.01224916, + "balance_loss_clip": 1.14285088, + "balance_loss_mlp": 1.02922177, + "epoch": 0.5934766270855253, + "flos": 24422565235680.0, + "grad_norm": 2.075005187357069, + "language_loss": 0.71785086, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74462008, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.8066413402557373 + }, + { + "auxiliary_loss_clip": 0.01455045, + "auxiliary_loss_mlp": 0.01228765, + "balance_loss_clip": 1.14569831, + "balance_loss_mlp": 1.03164029, + "epoch": 0.5935367503381933, + "flos": 23515390396800.0, + "grad_norm": 2.050225493369507, + "language_loss": 0.74908447, + "learning_rate": 1.496639802503271e-06, + "loss": 0.77592254, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 2.8035740852355957 + }, + { + "auxiliary_loss_clip": 0.01449575, + "auxiliary_loss_mlp": 0.01228928, + "balance_loss_clip": 1.14007902, + "balance_loss_mlp": 1.03170824, + "epoch": 0.5935968735908612, + "flos": 18950083950240.0, + "grad_norm": 2.9890455346346103, + "language_loss": 0.79383099, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.82061601, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.728286027908325 + }, + { + "auxiliary_loss_clip": 0.01456313, + "auxiliary_loss_mlp": 0.0122772, + "balance_loss_clip": 1.1471808, + "balance_loss_mlp": 1.02821183, + "epoch": 0.5936569968435292, + "flos": 25485777982080.0, + "grad_norm": 1.711949879492699, + "language_loss": 0.84781897, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87465936, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.841142416000366 + }, + { + "auxiliary_loss_clip": 0.01483835, + "auxiliary_loss_mlp": 0.01197014, + "balance_loss_clip": 1.20775676, + "balance_loss_mlp": 1.0112381, + "epoch": 0.5937171200961973, + "flos": 66384727799040.0, + "grad_norm": 0.7150158833104424, + "language_loss": 0.60023594, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62704444, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 3.3832132816314697 + }, + { + "auxiliary_loss_clip": 0.01447902, + "auxiliary_loss_mlp": 0.01221795, + "balance_loss_clip": 1.13835633, + "balance_loss_mlp": 1.02495623, + "epoch": 0.5937772433488652, + "flos": 14905270743840.0, + "grad_norm": 1.9813011318089528, + "language_loss": 0.77838266, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.80507958, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 2.7812631130218506 + }, + { + "auxiliary_loss_clip": 0.01452133, + "auxiliary_loss_mlp": 0.01212416, + "balance_loss_clip": 1.14291072, + "balance_loss_mlp": 1.01939273, + "epoch": 0.5938373666015332, + "flos": 22563497892960.0, + "grad_norm": 1.508879347031, + "language_loss": 0.7569958, + "learning_rate": 1.494755415907243e-06, + "loss": 0.78364134, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 2.8016111850738525 + }, + { + "auxiliary_loss_clip": 0.01452597, + "auxiliary_loss_mlp": 0.01224355, + "balance_loss_clip": 1.14415216, + "balance_loss_mlp": 1.02866149, + "epoch": 0.5938974898542011, + "flos": 18442789709760.0, + "grad_norm": 2.2651870958845235, + "language_loss": 0.80810875, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83487833, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 2.7667477130889893 + }, + { + "auxiliary_loss_clip": 0.01452724, + "auxiliary_loss_mlp": 0.01225968, + "balance_loss_clip": 1.14376402, + "balance_loss_mlp": 1.03132367, + "epoch": 0.5939576131068691, + "flos": 45590179212000.0, + "grad_norm": 1.782127318001743, + "language_loss": 0.71041322, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73720014, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 4.332936763763428 + }, + { + "auxiliary_loss_clip": 0.01457155, + "auxiliary_loss_mlp": 0.0122561, + "balance_loss_clip": 1.14758575, + "balance_loss_mlp": 1.03153729, + "epoch": 0.594017736359537, + "flos": 23590033744320.0, + "grad_norm": 1.4991148049694976, + "language_loss": 0.57454896, + "learning_rate": 1.493625013742401e-06, + "loss": 0.60137653, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 2.828855037689209 + }, + { + "auxiliary_loss_clip": 0.01448151, + "auxiliary_loss_mlp": 0.01227343, + "balance_loss_clip": 1.13820326, + "balance_loss_mlp": 1.03107643, + "epoch": 0.594077859612205, + "flos": 29459626872480.0, + "grad_norm": 1.8894705358744908, + "language_loss": 0.7747941, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.80154896, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.820244312286377 + }, + { + "auxiliary_loss_clip": 0.01448878, + "auxiliary_loss_mlp": 0.0123242, + "balance_loss_clip": 1.13894224, + "balance_loss_mlp": 1.03634453, + "epoch": 0.594137982864873, + "flos": 16801963185600.0, + "grad_norm": 2.08234742648229, + "language_loss": 0.82482165, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.85163462, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.8379428386688232 + }, + { + "auxiliary_loss_clip": 0.0145313, + "auxiliary_loss_mlp": 0.01229949, + "balance_loss_clip": 1.14305639, + "balance_loss_mlp": 1.0351131, + "epoch": 0.594198106117541, + "flos": 12751915893120.0, + "grad_norm": 2.1854531094651084, + "language_loss": 0.79155099, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81838179, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.7573657035827637 + }, + { + "auxiliary_loss_clip": 0.01452638, + "auxiliary_loss_mlp": 0.01230192, + "balance_loss_clip": 1.14232326, + "balance_loss_mlp": 1.03507042, + "epoch": 0.5942582293702089, + "flos": 20998680130080.0, + "grad_norm": 3.392113678087491, + "language_loss": 0.74303347, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76986182, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.832425594329834 + }, + { + "auxiliary_loss_clip": 0.01459998, + "auxiliary_loss_mlp": 0.01244629, + "balance_loss_clip": 1.15139651, + "balance_loss_mlp": 1.04731369, + "epoch": 0.5943183526228769, + "flos": 28293855288480.0, + "grad_norm": 2.145937132110804, + "language_loss": 0.67007899, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.6971252, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.8856961727142334 + }, + { + "auxiliary_loss_clip": 0.01448215, + "auxiliary_loss_mlp": 0.01232222, + "balance_loss_clip": 1.13981509, + "balance_loss_mlp": 1.03738654, + "epoch": 0.5943784758755448, + "flos": 26617034576160.0, + "grad_norm": 4.354395944636293, + "language_loss": 0.77388585, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.80069029, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 2.8000686168670654 + }, + { + "auxiliary_loss_clip": 0.01486898, + "auxiliary_loss_mlp": 0.01204437, + "balance_loss_clip": 1.21442938, + "balance_loss_mlp": 1.0186615, + "epoch": 0.5944385991282128, + "flos": 64197275168160.0, + "grad_norm": 0.8286860023201387, + "language_loss": 0.64564139, + "learning_rate": 1.490988081420423e-06, + "loss": 0.67255473, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 4.658694267272949 + }, + { + "auxiliary_loss_clip": 0.01447499, + "auxiliary_loss_mlp": 0.01227919, + "balance_loss_clip": 1.13882244, + "balance_loss_mlp": 1.03470421, + "epoch": 0.5944987223808808, + "flos": 19573970447520.0, + "grad_norm": 3.296326242797004, + "language_loss": 0.69424379, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.72099793, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.7628610134124756 + }, + { + "auxiliary_loss_clip": 0.01457519, + "auxiliary_loss_mlp": 0.012313, + "balance_loss_clip": 1.14917827, + "balance_loss_mlp": 1.03541529, + "epoch": 0.5945588456335488, + "flos": 26180021944800.0, + "grad_norm": 1.6209099672027223, + "language_loss": 0.79539239, + "learning_rate": 1.490234845687366e-06, + "loss": 0.82228059, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 4.396183252334595 + }, + { + "auxiliary_loss_clip": 0.01445816, + "auxiliary_loss_mlp": 0.0123524, + "balance_loss_clip": 1.1372118, + "balance_loss_mlp": 1.04316962, + "epoch": 0.5946189688862168, + "flos": 20448450848160.0, + "grad_norm": 2.248361942802354, + "language_loss": 0.70865953, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.73547006, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 2.900785446166992 + }, + { + "auxiliary_loss_clip": 0.01449514, + "auxiliary_loss_mlp": 0.01232579, + "balance_loss_clip": 1.14101887, + "balance_loss_mlp": 1.03993666, + "epoch": 0.5946790921388847, + "flos": 13438877649120.0, + "grad_norm": 3.369674469443462, + "language_loss": 0.69163835, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71845925, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 2.8083648681640625 + }, + { + "auxiliary_loss_clip": 0.01452119, + "auxiliary_loss_mlp": 0.01239963, + "balance_loss_clip": 1.14397216, + "balance_loss_mlp": 1.04426908, + "epoch": 0.5947392153915527, + "flos": 20414353068000.0, + "grad_norm": 1.9748453733807116, + "language_loss": 0.53528726, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.56220812, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.01486874, + "auxiliary_loss_mlp": 0.01226944, + "balance_loss_clip": 1.21357203, + "balance_loss_mlp": 1.03964233, + "epoch": 0.5947993386442206, + "flos": 65625853523040.0, + "grad_norm": 0.6762482054861383, + "language_loss": 0.54466474, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.57180291, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.357952117919922 + }, + { + "auxiliary_loss_clip": 0.01445841, + "auxiliary_loss_mlp": 0.01221197, + "balance_loss_clip": 1.13656855, + "balance_loss_mlp": 1.02893639, + "epoch": 0.5948594618968887, + "flos": 23185222485120.0, + "grad_norm": 1.738343582711097, + "language_loss": 0.74795556, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.7746259, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 4.355402946472168 + }, + { + "auxiliary_loss_clip": 0.01446494, + "auxiliary_loss_mlp": 0.01229255, + "balance_loss_clip": 1.13785148, + "balance_loss_mlp": 1.03537321, + "epoch": 0.5949195851495566, + "flos": 13628520270720.0, + "grad_norm": 1.827479186492684, + "language_loss": 0.77491575, + "learning_rate": 1.487975602873434e-06, + "loss": 0.80167323, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.7656660079956055 + }, + { + "auxiliary_loss_clip": 0.01451545, + "auxiliary_loss_mlp": 0.01232923, + "balance_loss_clip": 1.14250135, + "balance_loss_mlp": 1.0405674, + "epoch": 0.5949797084022246, + "flos": 19752803543520.0, + "grad_norm": 1.7105492421590067, + "language_loss": 0.79144889, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81829357, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 2.8266751766204834 + }, + { + "auxiliary_loss_clip": 0.01444547, + "auxiliary_loss_mlp": 0.0123024, + "balance_loss_clip": 1.13627315, + "balance_loss_mlp": 1.03635812, + "epoch": 0.5950398316548925, + "flos": 25776158889600.0, + "grad_norm": 1.5330430239517312, + "language_loss": 0.83905029, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.86579812, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.8214120864868164 + }, + { + "auxiliary_loss_clip": 0.01444897, + "auxiliary_loss_mlp": 0.01226054, + "balance_loss_clip": 1.13562441, + "balance_loss_mlp": 1.03236258, + "epoch": 0.5950999549075605, + "flos": 23041245732480.0, + "grad_norm": 3.1315794266226065, + "language_loss": 0.70956576, + "learning_rate": 1.486846243389939e-06, + "loss": 0.73627526, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 2.8292927742004395 + }, + { + "auxiliary_loss_clip": 0.01448388, + "auxiliary_loss_mlp": 0.01233364, + "balance_loss_clip": 1.13845539, + "balance_loss_mlp": 1.03423738, + "epoch": 0.5951600781602284, + "flos": 32448775036320.0, + "grad_norm": 2.2915759246579004, + "language_loss": 0.64046276, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66728032, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.9367339611053467 + }, + { + "auxiliary_loss_clip": 0.01448992, + "auxiliary_loss_mlp": 0.01218742, + "balance_loss_clip": 1.14040124, + "balance_loss_mlp": 1.0206635, + "epoch": 0.5952202014128964, + "flos": 23802888764160.0, + "grad_norm": 1.6798781588196305, + "language_loss": 0.71851277, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74519002, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 2.805694103240967 + }, + { + "auxiliary_loss_clip": 0.01445441, + "auxiliary_loss_mlp": 0.01229887, + "balance_loss_clip": 1.13685322, + "balance_loss_mlp": 1.03390682, + "epoch": 0.5952803246655644, + "flos": 22494392056800.0, + "grad_norm": 1.7444911308544995, + "language_loss": 0.84550112, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.87225443, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.8166913986206055 + }, + { + "auxiliary_loss_clip": 0.0147506, + "auxiliary_loss_mlp": 0.01192253, + "balance_loss_clip": 1.20235181, + "balance_loss_mlp": 1.00647736, + "epoch": 0.5953404479182324, + "flos": 51240089887200.0, + "grad_norm": 0.7966483876290743, + "language_loss": 0.58169383, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60836691, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 3.16749906539917 + }, + { + "auxiliary_loss_clip": 0.01443902, + "auxiliary_loss_mlp": 0.01228374, + "balance_loss_clip": 1.1347549, + "balance_loss_mlp": 1.03411031, + "epoch": 0.5954005711709004, + "flos": 23114940876000.0, + "grad_norm": 1.8951968328822255, + "language_loss": 0.77549422, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.80221701, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 2.8437588214874268 + }, + { + "auxiliary_loss_clip": 0.01443474, + "auxiliary_loss_mlp": 0.01225345, + "balance_loss_clip": 1.13374305, + "balance_loss_mlp": 1.03289378, + "epoch": 0.5954606944235683, + "flos": 35957657877120.0, + "grad_norm": 1.7601043948925201, + "language_loss": 0.77845085, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.80513906, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 2.914339542388916 + }, + { + "auxiliary_loss_clip": 0.01439403, + "auxiliary_loss_mlp": 0.01227873, + "balance_loss_clip": 1.12926996, + "balance_loss_mlp": 1.03599381, + "epoch": 0.5955208176762363, + "flos": 30446186078880.0, + "grad_norm": 1.648769740345118, + "language_loss": 0.73193288, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.75860566, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 2.9070935249328613 + }, + { + "auxiliary_loss_clip": 0.01442877, + "auxiliary_loss_mlp": 0.0122266, + "balance_loss_clip": 1.13327229, + "balance_loss_mlp": 1.02696609, + "epoch": 0.5955809409289042, + "flos": 17642080308960.0, + "grad_norm": 1.6215376438589977, + "language_loss": 0.69908655, + "learning_rate": 1.483835475336295e-06, + "loss": 0.72574192, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 2.9504895210266113 + }, + { + "auxiliary_loss_clip": 0.01443805, + "auxiliary_loss_mlp": 0.01224014, + "balance_loss_clip": 1.1336292, + "balance_loss_mlp": 1.03060842, + "epoch": 0.5956410641815723, + "flos": 24282191658240.0, + "grad_norm": 1.9067542049711246, + "language_loss": 0.75542903, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.78210723, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 2.7907416820526123 + }, + { + "auxiliary_loss_clip": 0.0145192, + "auxiliary_loss_mlp": 0.0123167, + "balance_loss_clip": 1.1410296, + "balance_loss_mlp": 1.03673863, + "epoch": 0.5957011874342402, + "flos": 35737103440800.0, + "grad_norm": 1.5803334143182988, + "language_loss": 0.67243803, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69927394, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.9328720569610596 + }, + { + "auxiliary_loss_clip": 0.01444097, + "auxiliary_loss_mlp": 0.01225587, + "balance_loss_clip": 1.13430679, + "balance_loss_mlp": 1.03256297, + "epoch": 0.5957613106869082, + "flos": 21246770774880.0, + "grad_norm": 2.5453052404906895, + "language_loss": 0.76495707, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.79165393, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.801542282104492 + }, + { + "auxiliary_loss_clip": 0.01470006, + "auxiliary_loss_mlp": 0.0119577, + "balance_loss_clip": 1.19545043, + "balance_loss_mlp": 1.00961304, + "epoch": 0.5958214339395761, + "flos": 65947335886080.0, + "grad_norm": 0.9163242489559109, + "language_loss": 0.73369372, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.76035148, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 3.4259345531463623 + }, + { + "auxiliary_loss_clip": 0.01445128, + "auxiliary_loss_mlp": 0.01219405, + "balance_loss_clip": 1.1355474, + "balance_loss_mlp": 1.02437901, + "epoch": 0.5958815571922441, + "flos": 23220761535360.0, + "grad_norm": 1.6469327261531561, + "language_loss": 0.69774055, + "learning_rate": 1.481954380961799e-06, + "loss": 0.72438586, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.8179447650909424 + }, + { + "auxiliary_loss_clip": 0.01457679, + "auxiliary_loss_mlp": 0.01230599, + "balance_loss_clip": 1.14782262, + "balance_loss_mlp": 1.03490543, + "epoch": 0.595941680444912, + "flos": 16540142546880.0, + "grad_norm": 2.0571458197406596, + "language_loss": 0.65342659, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.68030941, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.7494568824768066 + }, + { + "auxiliary_loss_clip": 0.01449167, + "auxiliary_loss_mlp": 0.0124326, + "balance_loss_clip": 1.1381712, + "balance_loss_mlp": 1.04880595, + "epoch": 0.59600180369758, + "flos": 27821379463200.0, + "grad_norm": 2.107740407597522, + "language_loss": 0.7285918, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75551611, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.8676648139953613 + }, + { + "auxiliary_loss_clip": 0.01441728, + "auxiliary_loss_mlp": 0.01232731, + "balance_loss_clip": 1.13030505, + "balance_loss_mlp": 1.04008865, + "epoch": 0.596061926950248, + "flos": 29493838437120.0, + "grad_norm": 2.24975735966567, + "language_loss": 0.80180514, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.82854974, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 2.834775447845459 + }, + { + "auxiliary_loss_clip": 0.01448591, + "auxiliary_loss_mlp": 0.01228536, + "balance_loss_clip": 1.13862252, + "balance_loss_mlp": 1.03350985, + "epoch": 0.596122050202916, + "flos": 16838678008800.0, + "grad_norm": 1.812337484705886, + "language_loss": 0.6799143, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.70668554, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 2.807555913925171 + }, + { + "auxiliary_loss_clip": 0.01446842, + "auxiliary_loss_mlp": 0.0123046, + "balance_loss_clip": 1.13705444, + "balance_loss_mlp": 1.03734136, + "epoch": 0.596182173455584, + "flos": 20998793914560.0, + "grad_norm": 1.599000654944391, + "language_loss": 0.78415507, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81092811, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 2.770397901535034 + }, + { + "auxiliary_loss_clip": 0.01441559, + "auxiliary_loss_mlp": 0.01233499, + "balance_loss_clip": 1.13133466, + "balance_loss_mlp": 1.0414288, + "epoch": 0.5962422967082519, + "flos": 16067249511840.0, + "grad_norm": 2.4603971033822374, + "language_loss": 0.83075058, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.85750115, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 4.108014822006226 + }, + { + "auxiliary_loss_clip": 0.01449601, + "auxiliary_loss_mlp": 0.012272, + "balance_loss_clip": 1.13906753, + "balance_loss_mlp": 1.03379512, + "epoch": 0.5963024199609199, + "flos": 12168612891360.0, + "grad_norm": 1.7818476518592496, + "language_loss": 0.77197921, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79874718, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 2.7822399139404297 + }, + { + "auxiliary_loss_clip": 0.014539, + "auxiliary_loss_mlp": 0.01228081, + "balance_loss_clip": 1.14289141, + "balance_loss_mlp": 1.03353179, + "epoch": 0.5963625432135878, + "flos": 28076790242880.0, + "grad_norm": 1.5209946815267108, + "language_loss": 0.78988194, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81670177, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 2.831080198287964 + }, + { + "auxiliary_loss_clip": 0.01452818, + "auxiliary_loss_mlp": 0.01227999, + "balance_loss_clip": 1.14118695, + "balance_loss_mlp": 1.03373528, + "epoch": 0.5964226664662559, + "flos": 19862113593600.0, + "grad_norm": 2.0600007381490713, + "language_loss": 0.77878904, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.80559719, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.8655028343200684 + }, + { + "auxiliary_loss_clip": 0.01455055, + "auxiliary_loss_mlp": 0.01231968, + "balance_loss_clip": 1.14480472, + "balance_loss_mlp": 1.03675056, + "epoch": 0.5964827897189238, + "flos": 12934276308000.0, + "grad_norm": 3.1771004302524264, + "language_loss": 0.82776725, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.85463744, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.831364631652832 + }, + { + "auxiliary_loss_clip": 0.01453475, + "auxiliary_loss_mlp": 0.01233031, + "balance_loss_clip": 1.14377713, + "balance_loss_mlp": 1.03981662, + "epoch": 0.5965429129715918, + "flos": 18152788083840.0, + "grad_norm": 1.888125318021351, + "language_loss": 0.80714023, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.83400536, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.795870304107666 + }, + { + "auxiliary_loss_clip": 0.01450384, + "auxiliary_loss_mlp": 0.01221567, + "balance_loss_clip": 1.13972962, + "balance_loss_mlp": 1.02539659, + "epoch": 0.5966030362242597, + "flos": 21765443463360.0, + "grad_norm": 3.176585809677138, + "language_loss": 0.76894951, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79566896, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 2.8544836044311523 + }, + { + "auxiliary_loss_clip": 0.01452506, + "auxiliary_loss_mlp": 0.01232964, + "balance_loss_clip": 1.13963509, + "balance_loss_mlp": 1.03593516, + "epoch": 0.5966631594769277, + "flos": 18809217306720.0, + "grad_norm": 2.4130832755553255, + "language_loss": 0.75908852, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78594327, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.793118953704834 + }, + { + "auxiliary_loss_clip": 0.01452248, + "auxiliary_loss_mlp": 0.01224456, + "balance_loss_clip": 1.14115798, + "balance_loss_mlp": 1.03009748, + "epoch": 0.5967232827295956, + "flos": 14065912183680.0, + "grad_norm": 2.483028163055094, + "language_loss": 0.66577387, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.69254094, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.800001859664917 + }, + { + "auxiliary_loss_clip": 0.01458052, + "auxiliary_loss_mlp": 0.01235384, + "balance_loss_clip": 1.14671588, + "balance_loss_mlp": 1.04140687, + "epoch": 0.5967834059822636, + "flos": 17240151589920.0, + "grad_norm": 2.021691838171144, + "language_loss": 0.71492893, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.74186325, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 5.795275449752808 + }, + { + "auxiliary_loss_clip": 0.01460564, + "auxiliary_loss_mlp": 0.01230005, + "balance_loss_clip": 1.1498251, + "balance_loss_mlp": 1.03106821, + "epoch": 0.5968435292349316, + "flos": 42523429304160.0, + "grad_norm": 1.8254284602392687, + "language_loss": 0.70258516, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72949088, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 3.0095114707946777 + }, + { + "auxiliary_loss_clip": 0.01452484, + "auxiliary_loss_mlp": 0.01222911, + "balance_loss_clip": 1.13906491, + "balance_loss_mlp": 1.02693057, + "epoch": 0.5969036524875996, + "flos": 37633795882560.0, + "grad_norm": 1.7310652741168273, + "language_loss": 0.63230944, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65906346, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 2.9384498596191406 + }, + { + "auxiliary_loss_clip": 0.01449134, + "auxiliary_loss_mlp": 0.01215052, + "balance_loss_clip": 1.1374619, + "balance_loss_mlp": 1.02136111, + "epoch": 0.5969637757402676, + "flos": 23150593710720.0, + "grad_norm": 1.8322951666443676, + "language_loss": 0.69527316, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.72191507, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 2.801605701446533 + }, + { + "auxiliary_loss_clip": 0.01452834, + "auxiliary_loss_mlp": 0.01221132, + "balance_loss_clip": 1.1405127, + "balance_loss_mlp": 1.0260098, + "epoch": 0.5970238989929355, + "flos": 24023025990720.0, + "grad_norm": 2.279225955127647, + "language_loss": 0.76292408, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78966367, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.857839822769165 + }, + { + "auxiliary_loss_clip": 0.0145363, + "auxiliary_loss_mlp": 0.01235228, + "balance_loss_clip": 1.14116526, + "balance_loss_mlp": 1.03962898, + "epoch": 0.5970840222456035, + "flos": 19428893778240.0, + "grad_norm": 1.722343824650689, + "language_loss": 0.69165164, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71854025, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.7861974239349365 + }, + { + "auxiliary_loss_clip": 0.01472299, + "auxiliary_loss_mlp": 0.01198174, + "balance_loss_clip": 1.19329238, + "balance_loss_mlp": 1.01354218, + "epoch": 0.5971441454982714, + "flos": 62982310396320.0, + "grad_norm": 0.8539633699575738, + "language_loss": 0.64227962, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66898435, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 4.815271854400635 + }, + { + "auxiliary_loss_clip": 0.01454259, + "auxiliary_loss_mlp": 0.01227927, + "balance_loss_clip": 1.14248478, + "balance_loss_mlp": 1.02908552, + "epoch": 0.5972042687509395, + "flos": 20268290266560.0, + "grad_norm": 1.755143118081757, + "language_loss": 0.74059546, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76741731, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 2.7796084880828857 + }, + { + "auxiliary_loss_clip": 0.01469408, + "auxiliary_loss_mlp": 0.01188667, + "balance_loss_clip": 1.18961024, + "balance_loss_mlp": 1.00327301, + "epoch": 0.5972643920036074, + "flos": 71660170107360.0, + "grad_norm": 0.6708346256296215, + "language_loss": 0.51988655, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54646736, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 3.3685317039489746 + }, + { + "auxiliary_loss_clip": 0.01468517, + "auxiliary_loss_mlp": 0.01186546, + "balance_loss_clip": 1.18839431, + "balance_loss_mlp": 1.00115204, + "epoch": 0.5973245152562754, + "flos": 56898572690880.0, + "grad_norm": 0.8314203170643836, + "language_loss": 0.54228473, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56883538, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 3.1618614196777344 + }, + { + "auxiliary_loss_clip": 0.01450073, + "auxiliary_loss_mlp": 0.01218172, + "balance_loss_clip": 1.13799334, + "balance_loss_mlp": 1.02171516, + "epoch": 0.5973846385089433, + "flos": 24166320036480.0, + "grad_norm": 4.491309685513783, + "language_loss": 0.65886545, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68554789, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.791477918624878 + }, + { + "auxiliary_loss_clip": 0.01452928, + "auxiliary_loss_mlp": 0.01224732, + "balance_loss_clip": 1.14041758, + "balance_loss_mlp": 1.02865648, + "epoch": 0.5974447617616113, + "flos": 17672233560480.0, + "grad_norm": 2.2101555363795415, + "language_loss": 0.67812884, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.70490545, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.820707321166992 + }, + { + "auxiliary_loss_clip": 0.01452477, + "auxiliary_loss_mlp": 0.01230167, + "balance_loss_clip": 1.1400317, + "balance_loss_mlp": 1.03409123, + "epoch": 0.5975048850142792, + "flos": 22894689864960.0, + "grad_norm": 2.4888131094530497, + "language_loss": 0.77223539, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79906183, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": 2.7883057594299316 + }, + { + "auxiliary_loss_clip": 0.01452672, + "auxiliary_loss_mlp": 0.01231189, + "balance_loss_clip": 1.14034271, + "balance_loss_mlp": 1.03692532, + "epoch": 0.5975650082669473, + "flos": 24355431663840.0, + "grad_norm": 2.173162661456271, + "language_loss": 0.75625235, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78309095, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 2.8251945972442627 + }, + { + "auxiliary_loss_clip": 0.01452856, + "auxiliary_loss_mlp": 0.01231174, + "balance_loss_clip": 1.13996315, + "balance_loss_mlp": 1.03509808, + "epoch": 0.5976251315196152, + "flos": 20925478052640.0, + "grad_norm": 2.321408210048986, + "language_loss": 0.69058049, + "learning_rate": 1.471053774486878e-06, + "loss": 0.7174207, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 2.8270351886749268 + }, + { + "auxiliary_loss_clip": 0.01452446, + "auxiliary_loss_mlp": 0.01224432, + "balance_loss_clip": 1.14124048, + "balance_loss_mlp": 1.03026354, + "epoch": 0.5976852547722832, + "flos": 35847475479360.0, + "grad_norm": 1.390582969440959, + "language_loss": 0.70063388, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72740269, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 2.8678200244903564 + }, + { + "auxiliary_loss_clip": 0.01447152, + "auxiliary_loss_mlp": 0.01219174, + "balance_loss_clip": 1.13438296, + "balance_loss_mlp": 1.02681816, + "epoch": 0.5977453780249512, + "flos": 12857395199040.0, + "grad_norm": 2.192787959168803, + "language_loss": 0.7711491, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79781234, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 2.8213465213775635 + }, + { + "auxiliary_loss_clip": 0.01449999, + "auxiliary_loss_mlp": 0.01226141, + "balance_loss_clip": 1.13702476, + "balance_loss_mlp": 1.02749062, + "epoch": 0.5978055012776191, + "flos": 20961244671840.0, + "grad_norm": 2.2181089185275913, + "language_loss": 0.75807559, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78483707, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 2.7712979316711426 + }, + { + "auxiliary_loss_clip": 0.01448058, + "auxiliary_loss_mlp": 0.01221888, + "balance_loss_clip": 1.13502872, + "balance_loss_mlp": 1.0285778, + "epoch": 0.5978656245302871, + "flos": 34060851650880.0, + "grad_norm": 1.9141802099289824, + "language_loss": 0.61536747, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64206696, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 2.893446445465088 + }, + { + "auxiliary_loss_clip": 0.01458208, + "auxiliary_loss_mlp": 0.01226151, + "balance_loss_clip": 1.14522791, + "balance_loss_mlp": 1.03007555, + "epoch": 0.597925747782955, + "flos": 37376033556960.0, + "grad_norm": 1.6377402645773822, + "language_loss": 0.72623122, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.75307482, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 2.942507743835449 + }, + { + "auxiliary_loss_clip": 0.01454051, + "auxiliary_loss_mlp": 0.01229269, + "balance_loss_clip": 1.14083219, + "balance_loss_mlp": 1.03185892, + "epoch": 0.5979858710356231, + "flos": 25377188567040.0, + "grad_norm": 2.956171783647372, + "language_loss": 0.66856635, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69539952, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.830288887023926 + }, + { + "auxiliary_loss_clip": 0.01452343, + "auxiliary_loss_mlp": 0.0123693, + "balance_loss_clip": 1.13803673, + "balance_loss_mlp": 1.04228544, + "epoch": 0.598045994288291, + "flos": 13700091437280.0, + "grad_norm": 2.154774307434209, + "language_loss": 0.88813823, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91503102, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.739396572113037 + }, + { + "auxiliary_loss_clip": 0.01451199, + "auxiliary_loss_mlp": 0.01223995, + "balance_loss_clip": 1.13827765, + "balance_loss_mlp": 1.02877784, + "epoch": 0.598106117540959, + "flos": 21983532569280.0, + "grad_norm": 1.920922167023217, + "language_loss": 0.72043365, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74718559, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 2.9003524780273438 + }, + { + "auxiliary_loss_clip": 0.01453795, + "auxiliary_loss_mlp": 0.01227312, + "balance_loss_clip": 1.1404835, + "balance_loss_mlp": 1.02894807, + "epoch": 0.5981662407936269, + "flos": 20561515786080.0, + "grad_norm": 1.8387631919385883, + "language_loss": 0.89054954, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91736066, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.824192762374878 + }, + { + "auxiliary_loss_clip": 0.01446932, + "auxiliary_loss_mlp": 0.01228629, + "balance_loss_clip": 1.13329983, + "balance_loss_mlp": 1.03751302, + "epoch": 0.5982263640462949, + "flos": 14065570830240.0, + "grad_norm": 1.81385123031213, + "language_loss": 0.70573485, + "learning_rate": 1.467298838320673e-06, + "loss": 0.73249042, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.816657066345215 + }, + { + "auxiliary_loss_clip": 0.01452013, + "auxiliary_loss_mlp": 0.01225181, + "balance_loss_clip": 1.13856435, + "balance_loss_mlp": 1.03206182, + "epoch": 0.5982864872989628, + "flos": 17708872527360.0, + "grad_norm": 1.688932002995054, + "language_loss": 0.78039259, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80716455, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 2.856457233428955 + }, + { + "auxiliary_loss_clip": 0.0145316, + "auxiliary_loss_mlp": 0.01229239, + "balance_loss_clip": 1.14036584, + "balance_loss_mlp": 1.03430796, + "epoch": 0.5983466105516309, + "flos": 16766727560640.0, + "grad_norm": 1.8833870253616831, + "language_loss": 0.7403487, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76717269, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 2.7950615882873535 + }, + { + "auxiliary_loss_clip": 0.01453187, + "auxiliary_loss_mlp": 0.01220077, + "balance_loss_clip": 1.13992155, + "balance_loss_mlp": 1.02314341, + "epoch": 0.5984067338042988, + "flos": 20044511936640.0, + "grad_norm": 3.0330043765570776, + "language_loss": 0.78747141, + "learning_rate": 1.466172750724613e-06, + "loss": 0.8142041, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.8051681518554688 + }, + { + "auxiliary_loss_clip": 0.01448927, + "auxiliary_loss_mlp": 0.01222343, + "balance_loss_clip": 1.13625038, + "balance_loss_mlp": 1.02855587, + "epoch": 0.5984668570569668, + "flos": 26322140217600.0, + "grad_norm": 1.5782761950650661, + "language_loss": 0.69724649, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.72395915, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 2.860710382461548 + }, + { + "auxiliary_loss_clip": 0.01447436, + "auxiliary_loss_mlp": 0.01220428, + "balance_loss_clip": 1.1340847, + "balance_loss_mlp": 1.0247345, + "epoch": 0.5985269803096348, + "flos": 20595765278880.0, + "grad_norm": 2.201924208915878, + "language_loss": 0.7326467, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75932533, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 4.1803529262542725 + }, + { + "auxiliary_loss_clip": 0.01448366, + "auxiliary_loss_mlp": 0.0122813, + "balance_loss_clip": 1.13460064, + "balance_loss_mlp": 1.03386617, + "epoch": 0.5985871035623027, + "flos": 26867552623200.0, + "grad_norm": 2.3166924556376514, + "language_loss": 0.6854099, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.71217489, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 2.883420944213867 + }, + { + "auxiliary_loss_clip": 0.01457126, + "auxiliary_loss_mlp": 0.01232656, + "balance_loss_clip": 1.14414549, + "balance_loss_mlp": 1.03247988, + "epoch": 0.5986472268149707, + "flos": 19611747259200.0, + "grad_norm": 2.730190071758333, + "language_loss": 0.73597765, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.7628755, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 2.7650578022003174 + }, + { + "auxiliary_loss_clip": 0.01452485, + "auxiliary_loss_mlp": 0.01227709, + "balance_loss_clip": 1.13950741, + "balance_loss_mlp": 1.03411341, + "epoch": 0.5987073500676386, + "flos": 21796544918880.0, + "grad_norm": 1.7693892108032068, + "language_loss": 0.85038519, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.87718713, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.8306949138641357 + }, + { + "auxiliary_loss_clip": 0.01448564, + "auxiliary_loss_mlp": 0.01219893, + "balance_loss_clip": 1.13517928, + "balance_loss_mlp": 1.02410364, + "epoch": 0.5987674733203067, + "flos": 24316554935520.0, + "grad_norm": 2.4087077177769594, + "language_loss": 0.66147107, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68815565, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.8255820274353027 + }, + { + "auxiliary_loss_clip": 0.014468, + "auxiliary_loss_mlp": 0.0122139, + "balance_loss_clip": 1.13456368, + "balance_loss_mlp": 1.02541018, + "epoch": 0.5988275965729746, + "flos": 21322096829280.0, + "grad_norm": 1.763631631383268, + "language_loss": 0.84039783, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.86707973, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.7514617443084717 + }, + { + "auxiliary_loss_clip": 0.01446122, + "auxiliary_loss_mlp": 0.01222348, + "balance_loss_clip": 1.13245153, + "balance_loss_mlp": 1.02779806, + "epoch": 0.5988877198256426, + "flos": 25120336517280.0, + "grad_norm": 1.7020128256825522, + "language_loss": 0.79338229, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.82006705, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 2.8477511405944824 + }, + { + "auxiliary_loss_clip": 0.01449675, + "auxiliary_loss_mlp": 0.01226178, + "balance_loss_clip": 1.13740516, + "balance_loss_mlp": 1.02781415, + "epoch": 0.5989478430783105, + "flos": 26431336483200.0, + "grad_norm": 1.7714490701260508, + "language_loss": 0.67123854, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69799709, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.835071086883545 + }, + { + "auxiliary_loss_clip": 0.01452595, + "auxiliary_loss_mlp": 0.01227164, + "balance_loss_clip": 1.139166, + "balance_loss_mlp": 1.02975392, + "epoch": 0.5990079663309785, + "flos": 25780975765920.0, + "grad_norm": 1.3794646679370264, + "language_loss": 0.74354231, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.77033991, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 4.30025577545166 + }, + { + "auxiliary_loss_clip": 0.01451274, + "auxiliary_loss_mlp": 0.01217743, + "balance_loss_clip": 1.13862264, + "balance_loss_mlp": 1.02309799, + "epoch": 0.5990680895836464, + "flos": 36834831177120.0, + "grad_norm": 1.7013932763140582, + "language_loss": 0.67945468, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70614481, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 4.430784702301025 + }, + { + "auxiliary_loss_clip": 0.01463577, + "auxiliary_loss_mlp": 0.01223109, + "balance_loss_clip": 1.15142417, + "balance_loss_mlp": 1.02483976, + "epoch": 0.5991282128363145, + "flos": 24135825431520.0, + "grad_norm": 2.0963961934302495, + "language_loss": 0.77206039, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79892731, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.817302942276001 + }, + { + "auxiliary_loss_clip": 0.01448826, + "auxiliary_loss_mlp": 0.01216911, + "balance_loss_clip": 1.13552737, + "balance_loss_mlp": 1.01978683, + "epoch": 0.5991883360889824, + "flos": 10305487235520.0, + "grad_norm": 2.8698086769492193, + "language_loss": 0.77457333, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.80123067, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 2.7404234409332275 + }, + { + "auxiliary_loss_clip": 0.01448254, + "auxiliary_loss_mlp": 0.0121627, + "balance_loss_clip": 1.13581729, + "balance_loss_mlp": 1.02191091, + "epoch": 0.5992484593416504, + "flos": 23953465016640.0, + "grad_norm": 1.6044418856267917, + "language_loss": 0.73823535, + "learning_rate": 1.460920090376422e-06, + "loss": 0.76488066, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 2.831702947616577 + }, + { + "auxiliary_loss_clip": 0.01448996, + "auxiliary_loss_mlp": 0.01227262, + "balance_loss_clip": 1.13661873, + "balance_loss_mlp": 1.0306145, + "epoch": 0.5993085825943184, + "flos": 11945517268320.0, + "grad_norm": 2.5476740893274665, + "language_loss": 0.68981898, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.71658158, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 2.7650725841522217 + }, + { + "auxiliary_loss_clip": 0.01440717, + "auxiliary_loss_mlp": 0.01219885, + "balance_loss_clip": 1.12763, + "balance_loss_mlp": 1.02724326, + "epoch": 0.5993687058469863, + "flos": 19028899395360.0, + "grad_norm": 2.215732170696406, + "language_loss": 0.7941736, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.82077968, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.8433773517608643 + }, + { + "auxiliary_loss_clip": 0.01448014, + "auxiliary_loss_mlp": 0.01217812, + "balance_loss_clip": 1.13465869, + "balance_loss_mlp": 1.02230835, + "epoch": 0.5994288290996543, + "flos": 14284721924640.0, + "grad_norm": 2.050523726428674, + "language_loss": 0.81245112, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83910936, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 4.420806884765625 + }, + { + "auxiliary_loss_clip": 0.0145714, + "auxiliary_loss_mlp": 0.01227412, + "balance_loss_clip": 1.14514685, + "balance_loss_mlp": 1.02628255, + "epoch": 0.5994889523523222, + "flos": 19208566910880.0, + "grad_norm": 2.2902528486644784, + "language_loss": 0.62155795, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64840341, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 2.8011460304260254 + }, + { + "auxiliary_loss_clip": 0.01449755, + "auxiliary_loss_mlp": 0.01219641, + "balance_loss_clip": 1.13846302, + "balance_loss_mlp": 1.02394712, + "epoch": 0.5995490756049903, + "flos": 28039506497280.0, + "grad_norm": 1.5839850215697806, + "language_loss": 0.79184079, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81853473, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 2.8673477172851562 + }, + { + "auxiliary_loss_clip": 0.01439793, + "auxiliary_loss_mlp": 0.01227461, + "balance_loss_clip": 1.12705326, + "balance_loss_mlp": 1.02890623, + "epoch": 0.5996091988576582, + "flos": 29055156966720.0, + "grad_norm": 2.4298794717302705, + "language_loss": 0.76821733, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.79488993, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.798600196838379 + }, + { + "auxiliary_loss_clip": 0.01445848, + "auxiliary_loss_mlp": 0.0121547, + "balance_loss_clip": 1.13358891, + "balance_loss_mlp": 1.01977611, + "epoch": 0.5996693221103262, + "flos": 20816092146240.0, + "grad_norm": 2.0379220423431366, + "language_loss": 0.65297842, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67959166, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.8525073528289795 + }, + { + "auxiliary_loss_clip": 0.01449646, + "auxiliary_loss_mlp": 0.01225273, + "balance_loss_clip": 1.13741851, + "balance_loss_mlp": 1.02881622, + "epoch": 0.5997294453629941, + "flos": 23771180458080.0, + "grad_norm": 1.507958004066606, + "language_loss": 0.7420429, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76879215, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.962529182434082 + }, + { + "auxiliary_loss_clip": 0.01450172, + "auxiliary_loss_mlp": 0.01227155, + "balance_loss_clip": 1.1378758, + "balance_loss_mlp": 1.03031695, + "epoch": 0.5997895686156621, + "flos": 20962154947680.0, + "grad_norm": 1.8802642712036004, + "language_loss": 0.77227509, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79904842, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.7838845252990723 + }, + { + "auxiliary_loss_clip": 0.01453728, + "auxiliary_loss_mlp": 0.01219252, + "balance_loss_clip": 1.14271224, + "balance_loss_mlp": 1.02222252, + "epoch": 0.59984969186833, + "flos": 28367929713600.0, + "grad_norm": 3.4305817581416376, + "language_loss": 0.74821275, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.77494252, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 2.932729482650757 + }, + { + "auxiliary_loss_clip": 0.01445116, + "auxiliary_loss_mlp": 0.0121805, + "balance_loss_clip": 1.13386822, + "balance_loss_mlp": 1.02426302, + "epoch": 0.5999098151209981, + "flos": 22568163056640.0, + "grad_norm": 2.2423291764742097, + "language_loss": 0.6877349, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.71436656, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 2.776803731918335 + }, + { + "auxiliary_loss_clip": 0.01450067, + "auxiliary_loss_mlp": 0.01230333, + "balance_loss_clip": 1.13754046, + "balance_loss_mlp": 1.0343529, + "epoch": 0.599969938373666, + "flos": 18770606075520.0, + "grad_norm": 4.556144180982141, + "language_loss": 0.81282455, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83962858, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 2.837730646133423 + }, + { + "auxiliary_loss_clip": 0.01444394, + "auxiliary_loss_mlp": 0.01222404, + "balance_loss_clip": 1.13232422, + "balance_loss_mlp": 1.02880824, + "epoch": 0.600030061626334, + "flos": 11328799193280.0, + "grad_norm": 1.8713066253489368, + "language_loss": 0.69553888, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72220689, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.7250120639801025 + }, + { + "auxiliary_loss_clip": 0.01451172, + "auxiliary_loss_mlp": 0.01226694, + "balance_loss_clip": 1.13800883, + "balance_loss_mlp": 1.02880657, + "epoch": 0.600090184879002, + "flos": 16580384688960.0, + "grad_norm": 2.4366167252795994, + "language_loss": 0.68237495, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70915365, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 2.8111977577209473 + }, + { + "auxiliary_loss_clip": 0.0144173, + "auxiliary_loss_mlp": 0.01219257, + "balance_loss_clip": 1.12943971, + "balance_loss_mlp": 1.02470744, + "epoch": 0.6001503081316699, + "flos": 23620642133760.0, + "grad_norm": 2.2855441752637233, + "language_loss": 0.7868588, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.81346864, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 2.820430278778076 + }, + { + "auxiliary_loss_clip": 0.01445414, + "auxiliary_loss_mlp": 0.01227614, + "balance_loss_clip": 1.13282061, + "balance_loss_mlp": 1.0324924, + "epoch": 0.6002104313843379, + "flos": 20669763847680.0, + "grad_norm": 1.6574868663793332, + "language_loss": 0.72968656, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7564168, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 2.8818204402923584 + }, + { + "auxiliary_loss_clip": 0.0144376, + "auxiliary_loss_mlp": 0.01226222, + "balance_loss_clip": 1.13311505, + "balance_loss_mlp": 1.03033757, + "epoch": 0.6002705546370058, + "flos": 22457411736480.0, + "grad_norm": 2.1222240962267622, + "language_loss": 0.78344667, + "learning_rate": 1.454547250154447e-06, + "loss": 0.81014645, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.8319098949432373 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 215057825, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.385027962054902e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}